From b41c7d9b9980fc9fb54dd77b32371ab52b02bc28 Mon Sep 17 00:00:00 2001
From: fuhailin
Date: Wed, 25 Jun 2025 18:14:43 +0800
Subject: [PATCH 01/11] New version initialization

---
 .bazelversion | 2 +-
 BUILD | 41 +-
 README.md | 294 +-
 WORKSPACE | 105 +-
 build.sh | 9 +-
 build_deps/BUILD | 37 +
 build_deps/build_pip_pkg.sh | 4 +-
 build_deps/patches/BUILD | 15 +
 build_deps/patches/internal_visibility.patch | 13 +
 build_deps/patches/python_toolchain.patch | 74 +
 build_deps/patches/tensorflow_llvm_url.patch | 23 +
 build_deps/patches/tensorflow_serving.patch | 25 +
 .../tensorflow_tf_gen_op_wrapper_py.patch | 11 +
 build_deps/patches/tensorflow_zlib.patch | 11 +
 build_deps/patches/tf2xla_visibility.patch | 13 +
 build_deps/pip_tf/BUILD | 30 +
 build_deps/pip_tf/README.md | 25 +
 build_deps/pip_tf/defs.bzl | 132 +
 build_deps/pip_tf/pip_tf_flags_test.py | 65 +
 build_deps/pip_tf/tensorflow.bzl | 3536 ++
 build_deps/requirements.in | 37 +
 build_deps/requirements_lock_3_10.txt | 983 +
 build_deps/requirements_lock_3_11.txt | 733 +
 build_deps/requirements_lock_3_12.txt | 739 +
 build_deps/requirements_lock_3_13.txt | 739 +
 build_deps/tf_dependency/build_defs.bzl.tpl | 1 +
 build_deps/tf_dependency/tf_configure.bzl | 4 +
 build_deps/toolchains/gpu/crosstool/BUILD.tpl | 69 -
 .../toolchains/gpu/crosstool/CROSSTOOL.tpl | 1409 -
 .../gpu/crosstool/cc_toolchain_config.bzl.tpl | 1493 -
 .../crosstool_wrapper_driver_is_not_gcc.tpl | 269 -
 .../windows/msvc_wrapper_for_nvcc.py.tpl | 192 -
 build_deps/toolchains/gpu/cub.BUILD | 25 -
 build_deps/toolchains/gpu/cuda/BUILD.tpl | 227 -
 .../toolchains/gpu/cuda/BUILD.windows.tpl | 164 -
 .../toolchains/gpu/cuda/build_defs.bzl.tpl | 62 -
 .../toolchains/gpu/cuda/cuda_config.h.tpl | 26 -
 build_deps/toolchains/gpu/cuda_configure.bzl | 1116 -
 build_deps/toolchains/gpu/find_cuda_config.py | 632 -
 configure.py | 415 +-
 deepray/BUILD | 122 +-
 deepray/__init__.py | 113 +-
 deepray/activations/__init__.py | 27 -
 deepray/callbacks/__init__.py | 3 +-
 deepray/callbacks/callbacks.py | 45 +-
 deepray/callbacks/model_checkpoint.py | 147 +
 deepray/callbacks/profiler_callback.py | 68 +
 deepray/callbacks/progbar_logger.py | 458 +
 .../time_history.py} | 72 +-
 deepray/callbacks/time_stopping.py | 2 +-
 deepray/callbacks/tqdm_progress_bar.py | 2 +-
 deepray/callbacks/training_speed.py | 155 +
 deepray/copts.bzl | 8 +
 deepray/core/base_trainer.py | 991 -
 deepray/core/base_trainer_test.py | 351 -
 deepray/core/common/distribution_utils.py | 24 +-
 deepray/core/common/flags.py | 14 -
 deepray/core/compile_utils.py | 18 +-
 deepray/core/dllogger_class.py | 77 -
 deepray/core/module.py | 627 -
 .../core/platform/build_config.default.bzl | 4 +-
 deepray/core/platform/build_config_root.bzl | 10 +-
 deepray/core/trainer.py | 3073 ++
 deepray/core/utils/misc/distribution_utils.py | 34 -
 deepray/core/utils/misc/keras_utils.py | 208 -
 deepray/custom_ops/BUILD | 5 +-
 deepray/custom_ops/correlation_cost/BUILD | 29 +-
 .../cc/kernels/correlation_cost_op_gpu.cu.cc | 3 +-
 .../correlation_cost/python/optical_flow.py | 6 +-
 .../python/tests/run_all_test.py | 3 +-
 .../custom_ops/distributed_embeddings/BUILD | 17 +-
 .../cc/kernels/embedding_lookup_kernels.cu.cc | 5 +-
 .../cc/ops/embedding_lookup_ops.cc | 11 +-
 .../python/tests/dist_model_parallel_test.py | 2 -
 deepray/custom_ops/embedding_bag/BUILD | 49 +
 deepray/custom_ops/embedding_bag/__init__.py | 1 +
 .../embedding_bag_backward_kernels.cu.cc | 247 +
 .../cc/kernels/embedding_bag_ops.cc | 330 +
 .../cc/kernels/embedding_bag_ops.h | 57 +
 .../cc/kernels/embedding_bag_ops_gpu.cu.cc | 108 +
.../embedding_bag/cc/ops/embedding_bag_ops.cc | 70 + .../embedding_bag/python}/__init__.py | 0 .../embedding_bag/python/embedding_bag.py | 143 + .../embedding_bag/python/tests}/__init__.py | 0 .../python/tests/embedding_bag_test.py | 116 + .../python}/tests/run_all_test.py | 3 +- deepray/custom_ops/embedding_variable/BUILD | 282 + .../custom_ops/embedding_variable/__init__.py | 3 + .../embedding_variable/cc/embedding/BUILD | 269 + .../cc/embedding/batch.cu.cc | 219 + .../embedding_variable/cc/embedding/batch.h | 66 + .../cc/embedding/bloom_filter_policy.h | 438 + .../embedding_variable/cc/embedding/cache.h | 521 + .../cc/embedding/cache_factory.h | 47 + .../cc/embedding/cache_thread_pool_creator.h | 45 + .../cc/embedding/config.proto | 58 + .../counter_filter_descriptor_impl.h | 252 + .../cc/embedding/counter_filter_policy.h | 189 + .../cc/embedding/cpu_hash_map_kv.h | 214 + .../cc/embedding/dense_hash_map_kv.h | 151 + .../cc/embedding/dram_leveldb_storage.h | 221 + .../cc/embedding/dram_pmem_storage.h | 218 + .../cc/embedding/dram_ssd_storage.h | 214 + .../dynamic_dim_feature_descriptor_impl.h | 195 + .../cc/embedding/emb_file.h | 244 + .../cc/embedding/emb_file_creator.h | 97 + .../cc/embedding/embedding_config.h | 110 + .../cc/embedding/embedding_memory_pool.h | 89 + .../cc/embedding/embedding_var.cu.cc | 77 + .../cc/embedding/embedding_var.h | 706 + .../cc/embedding/embedding_var_ckpt_data.cc | 229 + .../cc/embedding/embedding_var_ckpt_data.h | 57 + .../cc/embedding/embedding_var_context.h | 64 + .../embedding/embedding_var_dump_iterator.h | 91 + .../cc/embedding/embedding_var_restore.cc | 646 + .../cc/embedding/embedding_var_restore.h | 223 + .../cc/embedding/eviction_manager.h | 139 + .../cc/embedding/feature_descriptor.h | 154 + .../cc/embedding/feature_descriptor_impl.h | 299 + .../cc/embedding/filter_factory.h | 51 + .../cc/embedding/filter_policy.h | 106 + .../cc/embedding/globalstep_shrink_policy.h | 62 + .../cc/embedding/gpu_hash_map_kv.h | 333 + .../cc/embedding/gpu_hash_table.cu.cc | 708 + .../cc/embedding/gpu_hash_table.h | 136 + .../cc/embedding/hbm_dram_ssd_storage.h | 601 + .../cc/embedding/hbm_dram_storage.h | 536 + .../hbm_multi_tier_feature_descriptor.h | 116 + .../cc/embedding/hbm_storage_iterator.h | 124 + .../intra_thread_copy_id_allocator.h | 73 + .../cc/embedding/kv_interface.h | 121 + .../cc/embedding/l2weight_shrink_policy.h | 71 + .../cc/embedding/leveldb_kv.h | 288 + .../cc/embedding/multi_tier_storage.cu.cc | 188 + .../cc/embedding/multi_tier_storage.h | 303 + .../cc/embedding/normal_feature_descriptor.h | 127 + .../cc/embedding/nullable_filter_policy.h | 173 + .../cc/embedding/shrink_policy.h | 72 + .../cc/embedding/single_tier_storage.h | 581 + .../cc/embedding/ssd_hash_kv.h | 802 + .../cc/embedding/ssd_record_descriptor.cc | 80 + .../cc/embedding/ssd_record_descriptor.h | 105 + .../embedding_variable/cc/embedding/storage.h | 367 + .../cc/embedding/storage_config.h | 59 + .../cc/embedding/storage_factory.h | 78 + .../embedding_lookup_sparse_local_op.cc | 757 + .../embedding_lookup_sparse_local_op_test.cc | 901 + .../embedding_lookup_sparse_op.h | 11 + ...bedding_lookup_sparse_post_grad_op_test.cc | 394 + .../embedding_lookup_sparse_post_op.cc | 466 + .../embedding_lookup_sparse_post_op_test.cc | 419 + .../embedding_lookup_sparse_pre_op.cc | 315 + .../embedding_lookup_sparse_pre_op_test.cc | 627 + .../fused_embedding_common.cu.h | 98 + .../fused_embedding_local_ops_gpu.cu.cc | 315 + .../fused_embedding_local_ops_test.cc | 419 + 
.../cc/fused_embedding/fused_embedding_ops.cc | 308 + .../fused_embedding_post_grad_ops_test.cc | 243 + .../fused_embedding_post_ops_gpus.cu.cc | 328 + .../fused_embedding_post_ops_test.cc | 213 + .../fused_embedding_pre_ops_gpus.cu.cc | 521 + .../fused_embedding_pre_ops_test.cc | 352 + .../cc/fused_layer_norm/BUILD | 22 + .../cc/fused_layer_norm/compile_util.h | 78 + .../fused_layer_normalize_ops.cc | 678 + .../fused_layer_normalize_ops_test.cc | 269 + .../group_embedding_lookup_ops.cc | 176 + .../group_embedding_lookup_ops.cu.cc | 105 + .../group_embedding_lookup_ops_test.cc | 1089 + ...dding_lookup_sparse_backward_base_ops.cu.h | 371 + ...up_embedding_lookup_sparse_backward_ops.cc | 264 + ...embedding_lookup_sparse_backward_ops.cu.cc | 176 + ...edding_lookup_sparse_forward_base_ops.cu.h | 721 + ...embedding_lookup_sparse_forward_base_ops.h | 64 + ...oup_embedding_lookup_sparse_forward_ops.cc | 690 + ..._embedding_lookup_sparse_forward_ops.cu.cc | 309 + .../incr_save_restore_ops.cc | 493 + .../incr_save_restore/incr_save_restore_ops.h | 553 + .../incr_save_restore_ops_test.cc | 256 + .../cc/kernels/embedding_collection.cc | 95 + .../cc/kernels/hotness_calculate.cu.cc | 84 + .../cc/kernels/hotness_calculate.h | 41 + .../cc/kernels/kv_variable_lookup_ops.cc | 593 + .../cc/kernels/kv_variable_ops.cc | 620 + .../cc/kernels/kv_variable_restore_ops.cc | 259 + .../cc/kernels/kv_variable_util.cc | 69 + .../cc/kernels/kv_variable_util.h | 165 + .../cc/kernels/save_restore_ops.cc | 176 + .../cc/kernels/save_restore_tensor_ev.h | 82 + .../cc/kernels/training_adagrad_ops.cc | 383 + .../cc/kernels/training_adam_async_ops.cc | 603 + .../cc/kernels/training_adam_ops.cc | 529 + .../cc/kernels/training_ali_op_helpers.h | 182 + .../cc/kernels/training_ali_ops_gpu.cu.cc | 650 + .../cc/kernels/training_ali_ops_gpu.h | 119 + .../cc/kernels/training_ftrl_ops.cc | 485 + .../cc/kernels/training_sgd_ops.cc | 200 + .../cc/ops/embedding_collection.cc | 39 + .../cc/ops/group_embedding_ops.cc | 282 + .../cc/ops/incr_save_restore_ops.cc | 73 + .../cc/ops/kv_variable_ops.cc | 436 + .../cc/ops/save_restore_ops.cc | 122 + .../cc/ops/training_adagrad_ops.cc | 109 + .../cc/ops/training_adam_async_ops.cc | 129 + .../cc/ops/training_adam_ops.cc | 127 + .../cc/ops/training_ftrl_ops.cc | 96 + .../cc/ops/training_sgd_ops.cc | 80 + .../embedding_variable/cc/tests/BUILD | 65 + .../tests/embedding_variable_memory_test.cc | 80 + .../cc/tests/embedding_variable_ops_test.cc | 1324 + .../embedding_variable_performance_test.cc | 455 + .../cc/tests/embedding_variable_test.h | 109 + .../embedding_variable/config.proto | 58 + .../embedding_variable_ops_test.py | 114 + .../embedding_variable/multiplex_1_test.py | 50 + .../embedding_variable/python}/__init__.py | 0 .../python/group_embedding_lookup_ops.py | 543 + .../python/group_embedding_types.py | 54 + .../python/kv_variable_ops.py | 1027 + .../python/tests/__init__.py | 0 .../python/tests/embedding_bag_test.py | 116 + .../tests/group_embedding_lookup_ops_test.py | 254 + .../python/tests/run_all_test.py | 7 + .../embedding_variable/variable_scope.py | 1277 + .../embedding_variable/variables.py | 206 + deepray/custom_ops/ffm_ops/BUILD | 12 +- .../ffm_ops/cc/kernels/ffm_kernels.cu.cc | 3 +- deepray/custom_ops/ffm_ops/cc/ops/ffm_ops.cc | 5 +- deepray/custom_ops/ffm_ops/python/ffm_ops.py | 2 +- deepray/custom_ops/multiplex_1/BUILD | 17 +- .../custom_ops/multiplex_1/__init__.py | 0 .../custom_ops/multiplex_1/multiplex_1_op.cc | 9 +- deepray/custom_ops/multiplex_2/BUILD | 12 +- 
.../custom_ops/multiplex_2/__init__.py | 0 .../multiplex_2/multiplex_2_kernel.cc | 2 +- .../multiplex_2/multiplex_2_kernel.cu.cc | 2 +- .../custom_ops/multiplex_2/multiplex_2_op.cc | 2 +- deepray/custom_ops/multiplex_3/BUILD | 13 +- .../custom_ops/multiplex_3/__init__.py | 0 .../multiplex_3/multiplex_3_kernel.cc | 3 +- .../custom_ops/multiplex_3/multiplex_3_op.cc | 5 +- deepray/custom_ops/multiplex_4/BUILD | 5 + deepray/custom_ops/multiplex_4/__init__.py | 0 .../custom_ops/multiplex_4/multiplex_4_op.cc | 3 +- deepray/custom_ops/parquet_dataset/BUILD | 16 +- .../parquet_dataset/cc/kernels/arrow_util.cc | 14 +- .../parquet_dataset/cc/kernels/eigen.h | 2 +- .../cc/kernels/parquet_batch_reader.cc | 34 +- .../cc/kernels/parquet_dataset_ops.cc | 10 +- .../cc/kernels/parquet_dataset_ops.h | 2 +- .../cc/kernels/parquet_pybind.cc | 7 +- .../parquet_dataset/python/dataframe.py | 30 +- .../python/parquet_dataset_ops.py | 33 +- .../parquet_dataset/python/parquet_pybind.py | 7 +- .../python/tests/parquet_dataset_ops_test.py | 299 +- .../parquet_dataset/read_parquet_deepray.py | 24 +- deepray/custom_ops/seq2seq/BUILD | 30 +- deepray/custom_ops/seq2seq/__init__.py | 22 + .../seq2seq/cc/kernels/beam_search_ops.cc | 5 + .../seq2seq/cc/kernels/beam_search_ops.h | 5 + .../seq2seq/python}/README.md | 22 +- deepray/custom_ops/seq2seq/python/__init__.py | 0 .../seq2seq/python}/attention_wrapper.py | 99 +- .../seq2seq/python}/basic_decoder.py | 33 +- .../seq2seq/python}/beam_search_decoder.py | 33 +- .../seq2seq/python}/decoder.py | 22 +- .../seq2seq/python}/loss.py | 7 +- .../seq2seq/python}/sampler.py | 11 +- .../seq2seq/python/tests/__init__.py | 0 .../python}/tests/attention_wrapper_test.py | 18 +- .../python}/tests/basic_decoder_test.py | 9 +- .../python}/tests/beam_search_decoder_test.py | 8 +- .../python}/tests/beam_search_ops_test.py | 12 +- .../seq2seq/python}/tests/decoder_test.py | 9 +- .../seq2seq/python}/tests/loss_test.py | 5 +- .../seq2seq/python/tests/run_all_test.py | 9 + deepray/custom_ops/simple_hash_table/BUILD | 6 +- .../simple_hash_table_kernel.cc | 11 +- .../simple_hash_table/simple_hash_table_op.cc | 9 +- deepray/custom_ops/sleep/BUILD | 3 +- deepray/custom_ops/sleep/sleep_op.cc | 5 +- deepray/custom_ops/text/BUILD | 6 +- deepray/custom_ops/training_ops/BUILD | 8 +- .../training_ops/cc/kernels/training_ops.cc | 194 +- .../training_ops/cc/kernels/training_ops.h | 13 + .../cc/kernels/training_ops_gpu.cu.cc | 64 +- .../training_ops/cc/ops/training_ops.cc | 53 +- deepray/custom_ops/unique_ops/BUILD | 45 +- .../unique_ops/cc/kernels/random.cc | 58 - .../unique_ops/cc/kernels/task_runner.h | 2 +- .../unique_ops/cc/kernels/unique_ali_op.cc | 100 +- .../cc/kernels/unique_ali_op_gpu.cu.cc | 14 +- .../cc/kernels/unique_ali_op_util.h | 147 +- .../unique_ops/cc/ops/unique_ops.cc | 26 +- .../python/tests/unique_ali_op_test.py | 349 + .../unique_ops/python/tests/unique_op_test.py | 303 - deepray/custom_ops/utils/BUILD | 127 + deepray/custom_ops/utils/check.h | 33 + deepray/custom_ops/utils/fake_input.cc | 239 + deepray/custom_ops/utils/fake_input.h | 40 + .../utils/kernel_benchmark_testlib.cc | 210 + .../utils/kernel_benchmark_testlib.h | 86 + deepray/custom_ops/utils/ok_status_util.h | 41 + deepray/custom_ops/utils/ops_testutil.cc | 271 + deepray/custom_ops/utils/ops_testutil.h | 212 + deepray/custom_ops/utils/ops_testutil_test.cc | 52 + .../random_test.cc => utils/random.cc} | 23 +- .../{unique_ops/cc/kernels => utils}/random.h | 8 - deepray/custom_ops/utils/spin_lock.h | 73 + 
deepray/custom_ops/utils/spin_rw_lock.h | 248 + deepray/custom_ops/utils/tensor_testutil.cc | 294 + deepray/custom_ops/utils/tensor_testutil.h | 162 + .../custom_ops/utils/tensor_testutil_test.cc | 335 + deepray/custom_ops/zero_out/BUILD | 6 + .../zero_out/cc/kernels/zero_out_kernels.cc | 9 + .../zero_out/cc/ops/zero_out_ops.cc | 3 +- .../adult_census_income.py | 10 +- .../adult_census_income_test.py | 2 - .../ali_display_ad_click.py | 21 +- .../ali_display_ad_click_test.py | 2 - .../amazon_books_2014/amazon_books_2014.py | 10 +- .../amazon_books_2014_test.py | 8 +- deepray/datasets/avazu/avazu.py | 6 +- deepray/datasets/avazu/avazu_test.py | 2 - deepray/datasets/cifar/cifar.py | 19 +- deepray/datasets/cifar/cifar_test.py | 2 - .../creditcardfraud/creditcardfraud.py | 26 +- .../creditcardfraud/creditcardfraud_test.py | 2 - .../{docker => }/Dockerfile_preprocessing | 0 deepray/datasets/criteo/README.md | 282 - deepray/datasets/criteo/criteo.py | 18 +- deepray/datasets/criteo/criteo_dataset.md | 190 + deepray/datasets/criteo/criteo_test.py | 3 - deepray/datasets/criteo/criteo_tsv_reader.py | 11 +- .../datasets/criteo/criteo_tsv_reader_test.py | 2 - deepray/datasets/criteo/feature_map_small.csv | 80 +- .../datasets/criteo/feature_map_xlarge.csv | 80 +- .../datasets/criteo/preproc/data/__init__.py | 0 .../datasets/criteo/preproc/data/defaults.py | 43 + .../criteo/preproc/data/feature_spec.py | 268 + .../criteo/preproc/parquet_to_binary.py | 8 +- .../criteo/preproc/preproc_NVTabular.py | 6 +- .../criteo/preproc/spark_data_utils.py | 16 +- .../datasets/criteo/preproc/split_dataset.py | 6 +- .../requirements_preprocessing.txt | 2 +- deepray/datasets/csv_pipeline.py | 20 - deepray/datasets/csv_pipeline/__init__.py | 0 deepray/datasets/csv_pipeline/csv_pipeline.py | 18 + deepray/datasets/datapipeline.py | 51 +- deepray/datasets/dataset_factory.py | 6 +- .../GooglePretrainedWeightDownloader.py | 4 +- deepray/datasets/downloader/bertPrep.py | 21 +- .../downloader/create_datasets_from_start.sh | 2 +- .../downloader/create_finetuning_data.py | 2 - .../downloader/create_pretraining_data.py | 2 - .../datasets/fashion_mnist/fashion_mnist.py | 22 +- .../fashion_mnist/fashion_mnist_test.py | 2 - .../datasets/imagenet-1k/imagenet_to_gcs.py | 3 - deepray/datasets/imdb/imdb.py | 10 +- deepray/datasets/imdb/imdb_test.py | 2 - deepray/datasets/kafka_dataset.py | 43 - deepray/datasets/kafka_pipeline/__init__.py | 0 .../datasets/kafka_pipeline/kafka_pipeline.py | 254 + .../kafka_pipeline/kafka_pipeline_test.py | 52 + deepray/datasets/mnist/mnist.py | 28 +- deepray/datasets/mnist/mnist_test.py | 45 +- deepray/datasets/movielens/movielens.csv | 2 +- deepray/datasets/movielens/movielens.py | 4 +- .../movielens/movielens_100k_ratings.py | 84 +- .../movielens/movielens_100k_ratings_test.py | 43 - .../movielens/movielens_1m_ratings.py | 94 +- .../movielens/movielens_1m_ratings_test.py | 43 - .../movielens/movielens_ratings_test.py | 39 + deepray/datasets/movielens/process.py | 4 +- deepray/datasets/movielens/producer.py | 14 +- deepray/datasets/openwebtext/openwebtext.py | 7 +- .../datasets/openwebtext/openwebtext_test.py | 2 - .../parquet_pipeline/ali_parquet_dataset.py | 232 +- .../ali_parquet_dataset_test.py | 51 +- .../parquet_pipeline/parquet_pipeline_test.py | 2 - deepray/datasets/squad/classifier_dataset.py | 101 + deepray/datasets/squad/pretrain_dataset.py | 122 + deepray/datasets/squad/squad.py | 12 +- deepray/datasets/squad/squad_dataset.py | 111 + deepray/datasets/squad/squad_test.py | 2 - 
.../tfrecord_pipeline/tfrecord_pipeline.py | 11 +- .../tfrecord_pipeline_test.py | 2 - .../toxic_comment_classification_challenge.py | 7 +- ...c_comment_classification_challenge_test.py | 2 - .../datasets/wikicorpus_en/wikicorpus_en.py | 1 - .../wikicorpus_en/wikicorpus_en_test.py | 2 - deepray/deepray.bzl | 336 +- deepray/layers/BUILD | 3 +- deepray/layers/__init__.py | 3 +- deepray/layers/attention.py | 43 +- deepray/layers/dcn.py | 45 +- deepray/layers/dense.py | 287 + deepray/layers/dense_einsum.py | 1 - deepray/layers/dynamic_embedding.py | 258 +- deepray/layers/embedding.py | 62 +- deepray/layers/embedding_variable.py | 206 + deepray/layers/feature_cross.py | 31 +- deepray/layers/masked_softmax.py | 1 - deepray/layers/max_unpooling_2d.py | 41 +- deepray/layers/max_unpooling_2d_v2.py | 2 +- deepray/layers/mlp.py | 27 +- deepray/layers/networks/__init__.py | 17 - deepray/layers/noisy_dense.py | 33 +- deepray/layers/on_device_embedding.py | 1 - deepray/layers/pooling.py | 4 +- deepray/layers/rnn/esn_cell.py | 8 +- deepray/layers/rnn/layer_norm_lstm_cell.py | 2 +- .../layers/rnn/layer_norm_simple_rnn_cell.py | 2 +- deepray/layers/rnn/nas_cell.py | 8 +- deepray/layers/rnn/tests/esn_cell_test.py | 2 +- .../rnn/tests/layer_norm_lstm_cell_test.py | 2 +- .../tests/layer_norm_simple_rnn_cell_test.py | 2 +- deepray/layers/rnn/tests/nas_cell_test.py | 2 +- deepray/layers/self_attention_mask.py | 1 - .../tests_bak/on_device_embedding_test.py | 183 + deepray/layers/tf_utils.py | 7 +- deepray/layers/transformer.py | 15 +- deepray/layers/transformer_scaffold.py | 1 - deepray/losses/__init__.py | 23 +- deepray/losses/_loss_util.py | 281 + deepray/losses/contrastive.py | 4 +- deepray/losses/focal_loss.py | 7 +- deepray/losses/giou_loss.py | 4 +- deepray/losses/lifted.py | 11 +- deepray/losses/losses_impl.py | 1937 + deepray/losses/quantiles.py | 5 +- deepray/losses/softmax_loss.py | 167 + ...ed_sparse_categorical_crossentropy_test.py | 377 + deepray/losses/triplet.py | 12 +- deepray/losses/utils.py | 563 + ...eighted_sparse_categorical_crossentropy.py | 108 + deepray/metrics/__init__.py | 9 +- deepray/metrics/_ranking.py | 165 + deepray/metrics/alpha_dcg.py | 126 + deepray/metrics/arp.py | 47 + deepray/metrics/cohens_kappa.py | 7 +- deepray/metrics/dcg.py | 75 + deepray/metrics/f_scores.py | 8 +- deepray/metrics/geometric_mean.py | 7 +- deepray/metrics/hits.py | 65 + .../matthews_correlation_coefficient.py | 4 +- deepray/metrics/mean_average_precision.py | 79 + deepray/metrics/metrics_impl.py | 895 + deepray/metrics/mrr.py | 111 + .../metrics/multilabel_confusion_matrix.py | 2 +- deepray/metrics/ndcg.py | 131 + deepray/metrics/opa.py | 55 + deepray/metrics/precision.py | 73 + deepray/metrics/precision_ia.py | 88 + deepray/metrics/r_square.py | 2 +- deepray/metrics/recall.py | 73 + deepray/metrics/streaming_correlations.py | 6 +- deepray/metrics/utils.py | 153 +- deepray/models/BUILD | 1 + deepray/{layers/networks => models}/README.md | 4 +- deepray/models/__init__.py | 3 + .../albert_transformer_encoder.py | 21 +- .../networks => models}/bert_classifier.py | 5 +- .../networks => models}/bert_pretrainer.py | 9 +- .../networks => models}/bert_span_labeler.py | 7 +- .../networks => models}/classification.py | 1 - .../networks => models}/encoder_scaffold.py | 19 +- .../{layers/networks => models}/masked_lm.py | 2 - deepray/models/ncf_common.py | 2 - deepray/models/ncf_model.py | 2 +- deepray/models/ncf_test.py | 4 +- deepray/models/rec/base_model.py | 2 - deepray/models/rec/flen.py | 2 - 
deepray/models/rec/tfra_demo.py | 192 - deepray/models/rec/tower_new_tfra.py | 162 - .../networks => models}/span_labeling.py | 0 deepray/models/tests/__init__.py | 0 .../tests}/albert_transformer_encoder_test.py | 2 +- .../tests}/bert_classifier_test.py | 0 .../tests}/bert_pretrainer_test.py | 0 .../tests}/bert_span_labeler_test.py | 0 .../tests}/classification_test.py | 0 .../tests}/encoder_scaffold_test.py | 28 +- .../tests}/masked_lm_test.py | 3 +- .../tests}/span_labeling_test.py | 0 .../tests}/transformer_encoder_test.py | 2 +- .../transformer_encoder.py | 18 +- deepray/optimizers/BUILD | 13 + deepray/optimizers/__init__.py | 5 +- deepray/optimizers/adagrad.py | 83 + deepray/optimizers/adam.py | 99 +- deepray/optimizers/adam_async.py | 188 + deepray/optimizers/ev_optimizer_patch.py | 260 + deepray/optimizers/ftrl.py | 96 + deepray/optimizers/gradient_descent.py | 91 + deepray/optimizers/lazy_adam.py | 14 +- deepray/optimizers/multi_optimizer.py | 90 +- deepray/optimizers/optimization.py | 5 - .../tests/weight_decay_optimizers_test.py | 2 +- deepray/optimizers/weight_decay_optimizers.py | 2 +- deepray/repo.bzl | 48 + deepray/seq2seq/BUILD | 26 - deepray/seq2seq/__init__.py | 53 - deepray/tensorflow.bzl | 333 - deepray/utils/BUILD | 4 + deepray/utils/benchmark.py | 14 +- deepray/utils/ckpt_util.py | 11 + deepray/utils/data/feature_map.py | 160 +- deepray/utils/data/input_meta.py | 2 - deepray/utils/dllogger_class.py | 77 - deepray/utils/export/export.py | 157 +- deepray/utils/flags/_base.py | 20 +- deepray/utils/flags/_benchmark.py | 24 +- deepray/utils/flags/_device.py | 4 +- deepray/utils/flags/_distribution.py | 8 +- deepray/utils/flags/common_flags.py | 81 +- deepray/utils/flags/core.py | 2 +- deepray/utils/horovod_utils.py | 39 +- deepray/utils/keras_utils.py | 210 +- deepray/utils/logging_util.py | 392 + deepray/utils/logs/hooks.py | 113 - deepray/utils/logs/hooks_test.py | 142 - deepray/utils/logs/logger.py | 2 - deepray/utils/logs/metric_hook.py | 91 - deepray/utils/logs/metric_hook_test.py | 208 - deepray/utils/logs/mlperf_helper.py | 1 - deepray/utils/logs/summary_manager.py | 4 +- deepray/utils/resource_loader.py | 4 +- deepray/utils/test_utils.py | 10 +- deepray/utils/timer.py | 34 + deepray/utils/types.py | 33 +- deepray/version.py | 4 +- deepray/workspace0.bzl | 4 +- deepray/workspace2.bzl | 155 +- deepray/workspace3.bzl | 16 +- docker.sh | 30 +- .../run_horovod.sh | 64 +- .../CV/Classify_images_of_clothing/train.py | 50 +- modelzoo/CV/GAN/train.py | 5 +- modelzoo/CV/SwinTransformers/train.py | 4 +- modelzoo/CV/mnist/run_early.sh | 15 +- modelzoo/CV/mnist/run_horovod.sh | 46 +- modelzoo/CV/mnist/train.py | 132 +- modelzoo/CV/mnist/train_earlystop.py | 112 - modelzoo/ELECTRA/.gitignore | 129 + modelzoo/ELECTRA/Dockerfile | 31 + modelzoo/ELECTRA/LICENSE | 203 + modelzoo/ELECTRA/NOTICE | 5 + modelzoo/ELECTRA/README.md | 1005 + modelzoo/ELECTRA/build_pretraining_dataset.py | 237 + modelzoo/ELECTRA/configuration.py | 132 + modelzoo/ELECTRA/configuration_utils.py | 518 + modelzoo/ELECTRA/data/BooksDownloader.py | 26 + .../ELECTRA/data/BookscorpusTextFormatting.py | 32 + modelzoo/ELECTRA/data/Downloader.py | 91 + .../data/GooglePretrainedWeightDownloader.py | 158 + modelzoo/ELECTRA/data/MRPCDownloader.py | 44 + .../data/NVIDIAPretrainedWeightDownloader.py | 27 + modelzoo/ELECTRA/data/SquadDownloader.py | 54 + modelzoo/ELECTRA/data/TextSharding.py | 327 + modelzoo/ELECTRA/data/WikiDownloader.py | 57 + .../ELECTRA/data/WikicorpusTextFormatting.py | 46 + 
modelzoo/ELECTRA/data/__init__.py | 12 + .../data/create_datasets_from_start.sh | 47 + modelzoo/ELECTRA/data/dataPrep.py | 312 + modelzoo/ELECTRA/data/glue/download_mrpc.sh | 20 + modelzoo/ELECTRA/data/squad/squad_download.sh | 73 + modelzoo/ELECTRA/file_utils.py | 515 + modelzoo/ELECTRA/gpu_affinity.py | 63 + modelzoo/ELECTRA/images/total_loss.svg | 1 + modelzoo/ELECTRA/modeling.py | 1084 + modelzoo/ELECTRA/modeling_utils.py | 2843 ++ modelzoo/ELECTRA/optimization.py | 383 + .../ELECTRA/postprocess_pretrained_ckpt.py | 72 + modelzoo/ELECTRA/pretrain_utils.py | 367 + modelzoo/ELECTRA/run.sub | 88 + modelzoo/ELECTRA/run_inference.py | 212 + modelzoo/ELECTRA/run_pretraining.py | 505 + modelzoo/ELECTRA/run_tf_squad.py | 675 + .../ELECTRA/scripts/benchmark_pretraining.sh | 43 + modelzoo/ELECTRA/scripts/benchmark_squad.sh | 28 + modelzoo/ELECTRA/scripts/bind.sh | 226 + .../scripts/configs/pretrain_config.sh | 411 + .../ELECTRA/scripts/configs/squad_config.sh | 271 + modelzoo/ELECTRA/scripts/docker/build.sh | 15 + modelzoo/ELECTRA/scripts/docker/launch.sh | 29 + .../scripts/finetune_ckpts_on_squad.sh | 28 + modelzoo/ELECTRA/scripts/run_pretraining.sh | 171 + modelzoo/ELECTRA/scripts/run_squad.sh | 112 + modelzoo/ELECTRA/squad_utils.py | 1093 + modelzoo/ELECTRA/tokenization.py | 68 + modelzoo/ELECTRA/tokenization_utils.py | 2415 ++ modelzoo/ELECTRA/utils.py | 231 + modelzoo/ELECTRA/vocab/vocab.txt | 30522 ++++++++++++++++ modelzoo/LanguageModeling/BERT/.dockerignore | 27 + modelzoo/LanguageModeling/BERT/.gitignore | 147 + modelzoo/LanguageModeling/BERT/Bert_result.md | 26 - modelzoo/LanguageModeling/BERT/Dockerfile | 55 + modelzoo/LanguageModeling/BERT/README.md | 8 +- .../LanguageModeling/BERT/bert_dllogger.json | 15 + .../BERT/classifier_data_lib.py | 581 + .../LanguageModeling/BERT/common_flags.py | 72 + .../BERT/create_finetuning_data.py | 184 + .../BERT/create_pretraining_data.py | 655 + .../BERT/data/BooksDownloader.py | 26 + .../BERT/data/BookscorpusTextFormatting.py | 32 + .../LanguageModeling/BERT/data/Downloader.py | 123 + .../BERT/data/GLUEDownloader.py | 46 + .../data/GooglePretrainedWeightDownloader.py | 157 + .../data/NVIDIAPretrainedWeightDownloader.py | 27 + .../BERT/data/PubMedDownloader.py | 93 + .../BERT/data/PubMedTextFormatting.py | 44 + modelzoo/LanguageModeling/BERT/data/README.md | 28 + .../BERT/data/SquadDownloader.py | 54 + .../BERT/data/TextSharding.py | 331 + .../BERT/data/WikiDownloader.py | 59 + .../BERT/data/WikicorpusTextFormatting.py | 46 + .../LanguageModeling/BERT/data/__init__.py | 12 + .../LanguageModeling/BERT/data/bertPrep.py | 388 + .../create_biobert_datasets_from_start.sh | 55 + .../BERT/data/create_datasets_from_start.sh | 71 + .../BERT/data/images/bert_pipeline.png | Bin 0 -> 212516 bytes .../BERT/data/images/images_nvlamb.png | Bin 0 -> 88164 bytes .../LanguageModeling/BERT/gpu_affinity.py | 63 + .../LanguageModeling/BERT/input_pipeline.py | 232 + ...uad_train_benchmark_base_fp16_gpu4_bs8.log | 477 - ...ing_squad_base_fp16_gbs48.230222025408.log | 594 - .../BERT/model_saving_utils.py | 101 + .../BERT/official/modeling/__init__.py | 0 .../official/modeling/hyperparams/__init__.py | 0 .../modeling/hyperparams/params_dict.py | 410 + .../modeling/hyperparams/params_dict_test.py | 322 + .../official/modeling/training/__init__.py | 0 .../modeling/training/distributed_executor.py | 800 + .../BERT/official}/nlp/bert_modeling.py | 358 +- .../BERT/official}/nlp/bert_models.py | 229 +- .../BERT/official/nlp/modeling/__init__.py | 1 + 
.../official/nlp/modeling/losses/__init__.py | 17 + ...eighted_sparse_categorical_crossentropy.py | 106 + ...ed_sparse_categorical_crossentropy_test.py | 381 + .../BERT/official/nlp/transformer/__init__.py | 0 .../nlp/transformer/beam_search_v1.py | 184 +- .../nlp/transformer/beam_search_v1_test.py | 15 +- .../official}/nlp/transformer/model_params.py | 11 +- .../official}/nlp/transformer/model_utils.py | 16 +- .../nlp/transformer/model_utils_test.py | 27 +- .../LanguageModeling/BERT/optimization.py | 140 +- modelzoo/LanguageModeling/BERT/run.sub | 82 + .../LanguageModeling/BERT/run_classifier.py | 402 + .../LanguageModeling/BERT/run_pretraining.py | 205 + modelzoo/LanguageModeling/BERT/run_squad.py | 414 +- .../BERT/run_squad_predict.py | 287 - .../benchmark_pretraining_lamb_phase2.sh | 8 +- .../BERT/scripts/docker/build.sh | 15 + .../BERT/scripts/docker/launch.sh | 28 + .../BERT/scripts/finetune_train_benchmark.sh | 55 +- .../BERT/scripts/gen_squad_evel.sh | 40 - .../BERT/scripts/run_inference_benchmark.sh | 4 +- .../scripts/run_inference_benchmark_seq128.sh | 4 +- .../BERT/scripts/run_pretraining_adam.sh | 6 +- .../scripts/run_pretraining_lamb_phase1.sh | 8 +- .../scripts/run_pretraining_lamb_phase2.sh | 8 +- .../BERT/scripts/run_squad.sh | 25 +- .../BERT/scripts/run_squad_inference.sh | 4 +- modelzoo/LanguageModeling/BERT/squad_lib.py | 877 + .../LanguageModeling/BERT/squad_lib_sp.py | 868 + modelzoo/LanguageModeling/BERT/tf_trt.py | 70 + .../LanguageModeling/BERT/tokenization.py | 537 + .../run_horovod.sh | 2 +- .../trainer.py | 4 +- .../a.py | 67 + .../models.py | 242 + .../run.py | 126 + .../run_dp.py | 139 + .../run.sh} | 52 +- .../CreditCardFraudDetection/run_horovod.sh | 77 - .../CreditCardFraudDetection/train.py | 46 +- modelzoo/Recommendation/Criteo_DCN/README.md | 32 + .../Criteo_DCN/datasets/__init__.py | 0 .../Criteo_DCN/datasets/custom_dataset.py | 26 + .../datasets/custom_dataset_test.py | 49 + modelzoo/Recommendation/Criteo_DCN/dcn_v2.py | 119 + modelzoo/Recommendation/Criteo_DCN/eval.py | 43 + .../Criteo_DCN/feature_map_small.csv | 41 + modelzoo/Recommendation/Criteo_DCN/infer.py | 43 + modelzoo/Recommendation/Criteo_DCN/run.sh | 37 + modelzoo/Recommendation/Criteo_DCN/train.py | 84 + modelzoo/Recommendation/Criteo_DCN/train1.py | 90 + modelzoo/Recommendation/MovieLens/mymodel.py | 57 + modelzoo/Recommendation/MovieLens/run.sh | 3 + .../Recommendation/MovieLens/run_ranking.sh | 3 + modelzoo/Recommendation/MovieLens/train.py | 94 + .../Recommendation/MovieLens/train_ranking.py | 81 + modelzoo/Recommendation/NCF/run_ncf.py | 7 +- ...s_2014_dien_fp16_gbs32768.230321132836.log | 2 +- ...ks_2014_din_fp16_gbs32768.230321132123.log | 2 +- ...ks_2014_sim_fp16_gbs32768.230321133429.log | 2 +- modelzoo/Recommendation/SIM/main.py | 5 +- modelzoo/Recommendation/SIM/run_dien.py | 4 +- modelzoo/Recommendation/SIM/run_din.py | 4 +- modelzoo/Recommendation/SIM/run_horovod.sh | 4 +- modelzoo/Recommendation/SIM/run_sim.py | 4 +- modelzoo/Recommendation/TFRA/demo.py | 68 - modelzoo/Recommendation/TFRA/demo_tfra.py | 50 - modelzoo/Recommendation/WideDeep/train.py | 4 +- .../avazu-ctr-prediction/ccpm.py | 2 - .../avazu-ctr-prediction/ccpm_diamond.py | 2 - .../avazu-ctr-prediction/run_horovod.sh | 2 +- .../avazu-ctr-prediction/train.py | 24 +- .../Frozen-Graph-TensorFlow/README.md | 38 - .../TensorFlow_v2/README.md | 75 - .../TensorFlow_v2/example_1.py | 103 - .../TensorFlow_v2/example_2.py | 167 - .../TensorFlow_v2/utils.py | 38 - modelzoo/Recommendation/criteo_ctr/dcn_v2.py | 140 +- 
.../criteo_ctr/feature_map_small.csv | 80 +- modelzoo/Recommendation/criteo_ctr/frozen.py | 46 - modelzoo/Recommendation/criteo_ctr/infer.py | 91 +- .../criteo_ctr/optimize_for_inference.py | 41 - .../Recommendation/criteo_ctr/run_horovod.sh | 90 - .../Recommendation/criteo_ctr/run_optimize.sh | 76 - modelzoo/Recommendation/criteo_ctr/train.py | 149 +- .../keras_horovod_dis/demo_tfra.py | 5 +- .../keras_horovod_distributed_demo.py | 2 +- recommendation/create_ncf_data.py | 2 +- recommendation/movielens.py | 2 +- recommendation/movielens_dataset.py | 8 +- recommendation/ncf_common.py | 4 +- recommendation/ncf_keras_main.py | 8 +- recommendation/ncf_test.py | 4 +- recommendation/ranking/common.py | 2 +- .../preprocessing/criteo_preprocess.py | 1 - recommendation/ranking/train.py | 3 +- recommendation/ranking/train_test.py | 2 +- requirements.txt | 14 +- setup.py | 3 - third_party/arrow/arrow-20.patch | 13 + third_party/arrow/arrow.BUILD | 206 +- third_party/clang_toolchain/BUILD | 0 .../clang_toolchain/cc_configure_clang.bzl | 27 + .../clang_toolchain/download_clang.bzl | 64 + third_party/cuCollections/BUILD | 0 third_party/cuCollections/cuCollections.BUILD | 26 + .../cuco.BUILD | 3 +- .../cucollection.patch | 80 +- third_party/cutlass.BUILD | 27 +- third_party/flash_attn/BUILD | 0 third_party/flash_attn/flash_attn.BUILD | 51 + third_party/flash_attn/flash_attn.patch | 450 + third_party/gpus/BUILD.bazel | 0 third_party/gpus/find_cuda_config.py | 161 +- third_party/leveldb.BUILD | 80 + third_party/openblas.BUILD | 1 + third_party/openssl.BUILD | 56 + third_party/py/BUILD | 40 + third_party/py/pypi.bzl | 54 + third_party/py/python_init_pip.bzl | 53 + third_party/readerwriterqueue.BUILD | 10 + third_party/remote_config/BUILD | 0 third_party/remote_config/common.bzl | 327 + third_party/repo.bzl | 244 + third_party/sparsehash.BUILD | 12 - third_party/sparsehash_c11/BUILD | 0 .../{ => sparsehash_c11}/sparsehash_c11.BUILD | 0 .../sparsehash_c11/sparsehash_c11.patch | 4956 +++ third_party/tf/BUILD | 0 third_party/tf/tf_215.patch | 31 + third_party/xla/BUILD.bazel | 25 + third_party/xla/workspace.bzl | 37 + tools/build_base_container.sh | 20 +- tools/docker/base_container.Dockerfile | 125 +- tools/docker/bashrc.bash | 4 +- tools/docker/bazel.bazelrc | 2 + tools/docker/build_wheel.Dockerfile | 2 +- tools/docker/entry.sh | 38 + tools/docker/py3.10_env.yml | 11 + tools/docker/py3.8_env.yml | 11 + tools/docker/sanity_check.Dockerfile | 12 +- tools/docs/build_docs.py | 2 +- tools/install_deps/install_clang.sh | 9 +- tools/install_deps/install_cmake.sh | 5 +- tools/install_deps/install_miniforge.sh | 58 + tools/install_deps/install_nsight-systems.sh | 24 + tools/install_deps/install_openmpi.sh | 15 +- tools/install_deps/install_python.sh | 50 +- tools/install_deps/pytest.txt | 7 - tools/install_deps/tensorflow-cpu.txt | 1 - tools/install_deps/tensorflow.txt | 2 +- tools/install_deps/typedapi.txt | 1 - tools/install_deps/yapf.txt | 2 +- tools/update_release_version.sh | 1 - 791 files changed, 130839 insertions(+), 16923 deletions(-) create mode 100644 build_deps/BUILD create mode 100644 build_deps/patches/BUILD create mode 100644 build_deps/patches/internal_visibility.patch create mode 100644 build_deps/patches/python_toolchain.patch create mode 100644 build_deps/patches/tensorflow_llvm_url.patch create mode 100644 build_deps/patches/tensorflow_serving.patch create mode 100644 build_deps/patches/tensorflow_tf_gen_op_wrapper_py.patch create mode 100644 build_deps/patches/tensorflow_zlib.patch create mode 
100644 build_deps/patches/tf2xla_visibility.patch create mode 100644 build_deps/pip_tf/BUILD create mode 100644 build_deps/pip_tf/README.md create mode 100644 build_deps/pip_tf/defs.bzl create mode 100644 build_deps/pip_tf/pip_tf_flags_test.py create mode 100644 build_deps/pip_tf/tensorflow.bzl create mode 100644 build_deps/requirements.in create mode 100644 build_deps/requirements_lock_3_10.txt create mode 100644 build_deps/requirements_lock_3_11.txt create mode 100644 build_deps/requirements_lock_3_12.txt create mode 100644 build_deps/requirements_lock_3_13.txt delete mode 100644 build_deps/toolchains/gpu/crosstool/BUILD.tpl delete mode 100644 build_deps/toolchains/gpu/crosstool/CROSSTOOL.tpl delete mode 100755 build_deps/toolchains/gpu/crosstool/cc_toolchain_config.bzl.tpl delete mode 100644 build_deps/toolchains/gpu/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl delete mode 100644 build_deps/toolchains/gpu/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl delete mode 100644 build_deps/toolchains/gpu/cub.BUILD delete mode 100644 build_deps/toolchains/gpu/cuda/BUILD.tpl delete mode 100644 build_deps/toolchains/gpu/cuda/BUILD.windows.tpl delete mode 100644 build_deps/toolchains/gpu/cuda/build_defs.bzl.tpl delete mode 100644 build_deps/toolchains/gpu/cuda/cuda_config.h.tpl delete mode 100644 build_deps/toolchains/gpu/cuda_configure.bzl delete mode 100644 build_deps/toolchains/gpu/find_cuda_config.py create mode 100644 deepray/callbacks/model_checkpoint.py create mode 100644 deepray/callbacks/profiler_callback.py create mode 100644 deepray/callbacks/progbar_logger.py rename deepray/{utils/misc/keras_utils.py => callbacks/time_history.py} (65%) create mode 100644 deepray/callbacks/training_speed.py delete mode 100644 deepray/core/base_trainer.py delete mode 100644 deepray/core/base_trainer_test.py delete mode 100644 deepray/core/dllogger_class.py delete mode 100644 deepray/core/module.py create mode 100644 deepray/core/trainer.py delete mode 100644 deepray/core/utils/misc/keras_utils.py create mode 100644 deepray/custom_ops/embedding_bag/BUILD create mode 100644 deepray/custom_ops/embedding_bag/__init__.py create mode 100644 deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_backward_kernels.cu.cc create mode 100644 deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_ops.cc create mode 100644 deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_ops.h create mode 100644 deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_ops_gpu.cu.cc create mode 100644 deepray/custom_ops/embedding_bag/cc/ops/embedding_bag_ops.cc rename deepray/{layers/nlp => custom_ops/embedding_bag/python}/__init__.py (100%) create mode 100644 deepray/custom_ops/embedding_bag/python/embedding_bag.py rename deepray/{layers/nlp/transformer => custom_ops/embedding_bag/python/tests}/__init__.py (100%) create mode 100644 deepray/custom_ops/embedding_bag/python/tests/embedding_bag_test.py rename deepray/{seq2seq => custom_ops/embedding_bag/python}/tests/run_all_test.py (72%) create mode 100644 deepray/custom_ops/embedding_variable/BUILD create mode 100644 deepray/custom_ops/embedding_variable/__init__.py create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/BUILD create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/batch.cu.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/batch.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/bloom_filter_policy.h create mode 100644 
deepray/custom_ops/embedding_variable/cc/embedding/cache.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/cache_factory.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/cache_thread_pool_creator.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/config.proto create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/counter_filter_descriptor_impl.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/counter_filter_policy.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/cpu_hash_map_kv.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/dense_hash_map_kv.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/dram_leveldb_storage.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/dram_pmem_storage.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/dram_ssd_storage.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/dynamic_dim_feature_descriptor_impl.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/emb_file.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/emb_file_creator.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/embedding_config.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/embedding_memory_pool.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.cu.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_ckpt_data.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_ckpt_data.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_context.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_dump_iterator.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_restore.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_restore.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/eviction_manager.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/feature_descriptor.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/feature_descriptor_impl.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/filter_factory.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/filter_policy.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/globalstep_shrink_policy.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/gpu_hash_map_kv.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/gpu_hash_table.cu.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/gpu_hash_table.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/hbm_dram_ssd_storage.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/hbm_dram_storage.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/hbm_multi_tier_feature_descriptor.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/hbm_storage_iterator.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/intra_thread_copy_id_allocator.h create mode 100644 
deepray/custom_ops/embedding_variable/cc/embedding/kv_interface.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/l2weight_shrink_policy.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/leveldb_kv.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/multi_tier_storage.cu.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/multi_tier_storage.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/normal_feature_descriptor.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/nullable_filter_policy.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/shrink_policy.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/single_tier_storage.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/ssd_hash_kv.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/ssd_record_descriptor.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/ssd_record_descriptor.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/storage.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/storage_config.h create mode 100644 deepray/custom_ops/embedding_variable/cc/embedding/storage_factory.h create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_local_op.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_local_op_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_op.h create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_post_grad_op_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_post_op.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_post_op_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_pre_op.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_pre_op_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_common.cu.h create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_local_ops_gpu.cu.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_local_ops_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_post_grad_ops_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_post_ops_gpus.cu.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_post_ops_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_pre_ops_gpus.cu.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_pre_ops_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_layer_norm/BUILD create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_layer_norm/compile_util.h create mode 100644 deepray/custom_ops/embedding_variable/cc/fused_layer_norm/fused_layer_normalize_ops.cc create mode 100644 
deepray/custom_ops/embedding_variable/cc/fused_layer_norm/fused_layer_normalize_ops_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_ops.cu.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_ops_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_backward_base_ops.cu.h create mode 100644 deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_backward_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_backward_ops.cu.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_base_ops.cu.h create mode 100644 deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_base_ops.h create mode 100644 deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_ops.cu.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/incr_save_restore/incr_save_restore_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/incr_save_restore/incr_save_restore_ops.h create mode 100644 deepray/custom_ops/embedding_variable/cc/incr_save_restore/incr_save_restore_ops_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/embedding_collection.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/hotness_calculate.cu.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/hotness_calculate.h create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_lookup_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_restore_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.h create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/save_restore_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/save_restore_tensor_ev.h create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/training_adagrad_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/training_adam_async_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/training_adam_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/training_ali_op_helpers.h create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/training_ali_ops_gpu.cu.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/training_ali_ops_gpu.h create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/training_ftrl_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/kernels/training_sgd_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/ops/embedding_collection.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/ops/group_embedding_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/ops/incr_save_restore_ops.cc create mode 100644 
deepray/custom_ops/embedding_variable/cc/ops/kv_variable_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/ops/save_restore_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/ops/training_adagrad_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/ops/training_adam_async_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/ops/training_adam_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/ops/training_ftrl_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/ops/training_sgd_ops.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/tests/BUILD create mode 100644 deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_memory_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_ops_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_performance_test.cc create mode 100644 deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_test.h create mode 100644 deepray/custom_ops/embedding_variable/config.proto create mode 100644 deepray/custom_ops/embedding_variable/embedding_variable_ops_test.py create mode 100644 deepray/custom_ops/embedding_variable/multiplex_1_test.py rename deepray/{seq2seq/tests => custom_ops/embedding_variable/python}/__init__.py (100%) create mode 100644 deepray/custom_ops/embedding_variable/python/group_embedding_lookup_ops.py create mode 100644 deepray/custom_ops/embedding_variable/python/group_embedding_types.py create mode 100644 deepray/custom_ops/embedding_variable/python/kv_variable_ops.py rename build_deps/toolchains/gpu/BUILD => deepray/custom_ops/embedding_variable/python/tests/__init__.py (100%) create mode 100644 deepray/custom_ops/embedding_variable/python/tests/embedding_bag_test.py create mode 100644 deepray/custom_ops/embedding_variable/python/tests/group_embedding_lookup_ops_test.py create mode 100644 deepray/custom_ops/embedding_variable/python/tests/run_all_test.py create mode 100644 deepray/custom_ops/embedding_variable/variable_scope.py create mode 100644 deepray/custom_ops/embedding_variable/variables.py rename build_deps/toolchains/gpu/crosstool/BUILD => deepray/custom_ops/multiplex_1/__init__.py (100%) rename build_deps/toolchains/gpu/cuda/BUILD => deepray/custom_ops/multiplex_2/__init__.py (100%) rename third_party/cucollection/BUILD => deepray/custom_ops/multiplex_3/__init__.py (100%) create mode 100644 deepray/custom_ops/multiplex_4/__init__.py create mode 100644 deepray/custom_ops/seq2seq/__init__.py rename deepray/{seq2seq => custom_ops/seq2seq/python}/README.md (90%) create mode 100644 deepray/custom_ops/seq2seq/python/__init__.py rename deepray/{seq2seq => custom_ops/seq2seq/python}/attention_wrapper.py (95%) rename deepray/{seq2seq => custom_ops/seq2seq/python}/basic_decoder.py (87%) rename deepray/{seq2seq => custom_ops/seq2seq/python}/beam_search_decoder.py (98%) rename deepray/{seq2seq => custom_ops/seq2seq/python}/decoder.py (97%) rename deepray/{seq2seq => custom_ops/seq2seq/python}/loss.py (99%) rename deepray/{seq2seq => custom_ops/seq2seq/python}/sampler.py (99%) create mode 100644 deepray/custom_ops/seq2seq/python/tests/__init__.py rename deepray/{seq2seq => custom_ops/seq2seq/python}/tests/attention_wrapper_test.py (98%) rename deepray/{seq2seq => custom_ops/seq2seq/python}/tests/basic_decoder_test.py (99%) rename deepray/{seq2seq => custom_ops/seq2seq/python}/tests/beam_search_decoder_test.py (98%) rename deepray/{seq2seq => 
custom_ops/seq2seq/python}/tests/beam_search_ops_test.py (94%) rename deepray/{seq2seq => custom_ops/seq2seq/python}/tests/decoder_test.py (97%) rename deepray/{seq2seq => custom_ops/seq2seq/python}/tests/loss_test.py (99%) create mode 100644 deepray/custom_ops/seq2seq/python/tests/run_all_test.py delete mode 100644 deepray/custom_ops/unique_ops/cc/kernels/random.cc create mode 100644 deepray/custom_ops/unique_ops/python/tests/unique_ali_op_test.py delete mode 100644 deepray/custom_ops/unique_ops/python/tests/unique_op_test.py create mode 100644 deepray/custom_ops/utils/BUILD create mode 100644 deepray/custom_ops/utils/check.h create mode 100644 deepray/custom_ops/utils/fake_input.cc create mode 100644 deepray/custom_ops/utils/fake_input.h create mode 100644 deepray/custom_ops/utils/kernel_benchmark_testlib.cc create mode 100644 deepray/custom_ops/utils/kernel_benchmark_testlib.h create mode 100644 deepray/custom_ops/utils/ok_status_util.h create mode 100644 deepray/custom_ops/utils/ops_testutil.cc create mode 100644 deepray/custom_ops/utils/ops_testutil.h create mode 100644 deepray/custom_ops/utils/ops_testutil_test.cc rename deepray/custom_ops/{unique_ops/cc/kernels/random_test.cc => utils/random.cc} (67%) rename deepray/custom_ops/{unique_ops/cc/kernels => utils}/random.h (82%) create mode 100644 deepray/custom_ops/utils/spin_lock.h create mode 100644 deepray/custom_ops/utils/spin_rw_lock.h create mode 100644 deepray/custom_ops/utils/tensor_testutil.cc create mode 100644 deepray/custom_ops/utils/tensor_testutil.h create mode 100644 deepray/custom_ops/utils/tensor_testutil_test.cc rename deepray/datasets/criteo/{docker => }/Dockerfile_preprocessing (100%) delete mode 100644 deepray/datasets/criteo/README.md create mode 100644 deepray/datasets/criteo/criteo_dataset.md create mode 100644 deepray/datasets/criteo/preproc/data/__init__.py create mode 100644 deepray/datasets/criteo/preproc/data/defaults.py create mode 100644 deepray/datasets/criteo/preproc/data/feature_spec.py rename deepray/datasets/criteo/{docker => }/requirements_preprocessing.txt (58%) delete mode 100644 deepray/datasets/csv_pipeline.py create mode 100644 deepray/datasets/csv_pipeline/__init__.py create mode 100644 deepray/datasets/csv_pipeline/csv_pipeline.py delete mode 100644 deepray/datasets/kafka_dataset.py create mode 100644 deepray/datasets/kafka_pipeline/__init__.py create mode 100644 deepray/datasets/kafka_pipeline/kafka_pipeline.py create mode 100644 deepray/datasets/kafka_pipeline/kafka_pipeline_test.py delete mode 100644 deepray/datasets/movielens/movielens_100k_ratings_test.py delete mode 100644 deepray/datasets/movielens/movielens_1m_ratings_test.py create mode 100644 deepray/datasets/movielens/movielens_ratings_test.py create mode 100644 deepray/datasets/squad/classifier_dataset.py create mode 100644 deepray/datasets/squad/pretrain_dataset.py create mode 100644 deepray/datasets/squad/squad_dataset.py create mode 100644 deepray/layers/dense.py create mode 100644 deepray/layers/embedding_variable.py create mode 100644 deepray/layers/tests_bak/on_device_embedding_test.py create mode 100644 deepray/losses/_loss_util.py create mode 100644 deepray/losses/losses_impl.py create mode 100644 deepray/losses/softmax_loss.py create mode 100644 deepray/losses/tests/weighted_sparse_categorical_crossentropy_test.py create mode 100644 deepray/losses/utils.py create mode 100644 deepray/losses/weighted_sparse_categorical_crossentropy.py create mode 100644 deepray/metrics/_ranking.py create mode 100644 
deepray/metrics/alpha_dcg.py create mode 100644 deepray/metrics/arp.py create mode 100644 deepray/metrics/dcg.py create mode 100644 deepray/metrics/hits.py create mode 100644 deepray/metrics/mean_average_precision.py create mode 100644 deepray/metrics/metrics_impl.py create mode 100644 deepray/metrics/mrr.py create mode 100644 deepray/metrics/ndcg.py create mode 100644 deepray/metrics/opa.py create mode 100644 deepray/metrics/precision.py create mode 100644 deepray/metrics/precision_ia.py create mode 100644 deepray/metrics/recall.py rename deepray/{layers/networks => models}/README.md (96%) rename deepray/{layers/networks => models}/albert_transformer_encoder.py (91%) rename deepray/{layers/networks => models}/bert_classifier.py (95%) rename deepray/{layers/networks => models}/bert_pretrainer.py (96%) rename deepray/{layers/networks => models}/bert_span_labeler.py (93%) rename deepray/{layers/networks => models}/classification.py (97%) rename deepray/{layers/networks => models}/encoder_scaffold.py (94%) rename deepray/{layers/networks => models}/masked_lm.py (98%) delete mode 100644 deepray/models/rec/tfra_demo.py delete mode 100644 deepray/models/rec/tower_new_tfra.py rename deepray/{layers/networks => models}/span_labeling.py (100%) create mode 100644 deepray/models/tests/__init__.py rename deepray/{layers/networks => models/tests}/albert_transformer_encoder_test.py (98%) rename deepray/{layers/networks => models/tests}/bert_classifier_test.py (100%) rename deepray/{layers/networks => models/tests}/bert_pretrainer_test.py (100%) rename deepray/{layers/networks => models/tests}/bert_span_labeler_test.py (100%) rename deepray/{layers/networks => models/tests}/classification_test.py (100%) rename deepray/{layers/networks => models/tests}/encoder_scaffold_test.py (96%) rename deepray/{layers/networks => models/tests}/masked_lm_test.py (98%) rename deepray/{layers/networks => models/tests}/span_labeling_test.py (100%) rename deepray/{layers/networks => models/tests}/transformer_encoder_test.py (99%) rename deepray/{layers/networks => models}/transformer_encoder.py (92%) create mode 100644 deepray/optimizers/adagrad.py create mode 100644 deepray/optimizers/adam_async.py create mode 100644 deepray/optimizers/ev_optimizer_patch.py create mode 100644 deepray/optimizers/ftrl.py create mode 100644 deepray/optimizers/gradient_descent.py create mode 100644 deepray/repo.bzl delete mode 100644 deepray/seq2seq/BUILD delete mode 100644 deepray/seq2seq/__init__.py delete mode 100644 deepray/tensorflow.bzl create mode 100644 deepray/utils/ckpt_util.py delete mode 100644 deepray/utils/dllogger_class.py create mode 100644 deepray/utils/logging_util.py delete mode 100644 deepray/utils/logs/hooks.py delete mode 100644 deepray/utils/logs/hooks_test.py delete mode 100644 deepray/utils/logs/metric_hook.py delete mode 100644 deepray/utils/logs/metric_hook_test.py create mode 100644 deepray/utils/timer.py delete mode 100644 modelzoo/CV/mnist/train_earlystop.py create mode 100644 modelzoo/ELECTRA/.gitignore create mode 100644 modelzoo/ELECTRA/Dockerfile create mode 100644 modelzoo/ELECTRA/LICENSE create mode 100644 modelzoo/ELECTRA/NOTICE create mode 100644 modelzoo/ELECTRA/README.md create mode 100644 modelzoo/ELECTRA/build_pretraining_dataset.py create mode 100644 modelzoo/ELECTRA/configuration.py create mode 100644 modelzoo/ELECTRA/configuration_utils.py create mode 100644 modelzoo/ELECTRA/data/BooksDownloader.py create mode 100644 modelzoo/ELECTRA/data/BookscorpusTextFormatting.py create mode 100644 
modelzoo/ELECTRA/data/Downloader.py create mode 100644 modelzoo/ELECTRA/data/GooglePretrainedWeightDownloader.py create mode 100644 modelzoo/ELECTRA/data/MRPCDownloader.py create mode 100644 modelzoo/ELECTRA/data/NVIDIAPretrainedWeightDownloader.py create mode 100644 modelzoo/ELECTRA/data/SquadDownloader.py create mode 100644 modelzoo/ELECTRA/data/TextSharding.py create mode 100644 modelzoo/ELECTRA/data/WikiDownloader.py create mode 100644 modelzoo/ELECTRA/data/WikicorpusTextFormatting.py create mode 100644 modelzoo/ELECTRA/data/__init__.py create mode 100755 modelzoo/ELECTRA/data/create_datasets_from_start.sh create mode 100644 modelzoo/ELECTRA/data/dataPrep.py create mode 100755 modelzoo/ELECTRA/data/glue/download_mrpc.sh create mode 100755 modelzoo/ELECTRA/data/squad/squad_download.sh create mode 100644 modelzoo/ELECTRA/file_utils.py create mode 100644 modelzoo/ELECTRA/gpu_affinity.py create mode 100644 modelzoo/ELECTRA/images/total_loss.svg create mode 100644 modelzoo/ELECTRA/modeling.py create mode 100644 modelzoo/ELECTRA/modeling_utils.py create mode 100644 modelzoo/ELECTRA/optimization.py create mode 100644 modelzoo/ELECTRA/postprocess_pretrained_ckpt.py create mode 100644 modelzoo/ELECTRA/pretrain_utils.py create mode 100644 modelzoo/ELECTRA/run.sub create mode 100644 modelzoo/ELECTRA/run_inference.py create mode 100644 modelzoo/ELECTRA/run_pretraining.py create mode 100644 modelzoo/ELECTRA/run_tf_squad.py create mode 100644 modelzoo/ELECTRA/scripts/benchmark_pretraining.sh create mode 100644 modelzoo/ELECTRA/scripts/benchmark_squad.sh create mode 100755 modelzoo/ELECTRA/scripts/bind.sh create mode 100644 modelzoo/ELECTRA/scripts/configs/pretrain_config.sh create mode 100644 modelzoo/ELECTRA/scripts/configs/squad_config.sh create mode 100644 modelzoo/ELECTRA/scripts/docker/build.sh create mode 100644 modelzoo/ELECTRA/scripts/docker/launch.sh create mode 100644 modelzoo/ELECTRA/scripts/finetune_ckpts_on_squad.sh create mode 100644 modelzoo/ELECTRA/scripts/run_pretraining.sh create mode 100644 modelzoo/ELECTRA/scripts/run_squad.sh create mode 100644 modelzoo/ELECTRA/squad_utils.py create mode 100644 modelzoo/ELECTRA/tokenization.py create mode 100644 modelzoo/ELECTRA/tokenization_utils.py create mode 100644 modelzoo/ELECTRA/utils.py create mode 100755 modelzoo/ELECTRA/vocab/vocab.txt create mode 100644 modelzoo/LanguageModeling/BERT/.dockerignore create mode 100644 modelzoo/LanguageModeling/BERT/.gitignore delete mode 100644 modelzoo/LanguageModeling/BERT/Bert_result.md create mode 100644 modelzoo/LanguageModeling/BERT/Dockerfile create mode 100644 modelzoo/LanguageModeling/BERT/bert_dllogger.json create mode 100644 modelzoo/LanguageModeling/BERT/classifier_data_lib.py create mode 100644 modelzoo/LanguageModeling/BERT/common_flags.py create mode 100644 modelzoo/LanguageModeling/BERT/create_finetuning_data.py create mode 100644 modelzoo/LanguageModeling/BERT/create_pretraining_data.py create mode 100644 modelzoo/LanguageModeling/BERT/data/BooksDownloader.py create mode 100644 modelzoo/LanguageModeling/BERT/data/BookscorpusTextFormatting.py create mode 100644 modelzoo/LanguageModeling/BERT/data/Downloader.py create mode 100644 modelzoo/LanguageModeling/BERT/data/GLUEDownloader.py create mode 100644 modelzoo/LanguageModeling/BERT/data/GooglePretrainedWeightDownloader.py create mode 100644 modelzoo/LanguageModeling/BERT/data/NVIDIAPretrainedWeightDownloader.py create mode 100644 modelzoo/LanguageModeling/BERT/data/PubMedDownloader.py create mode 100644 
modelzoo/LanguageModeling/BERT/data/PubMedTextFormatting.py create mode 100644 modelzoo/LanguageModeling/BERT/data/README.md create mode 100644 modelzoo/LanguageModeling/BERT/data/SquadDownloader.py create mode 100644 modelzoo/LanguageModeling/BERT/data/TextSharding.py create mode 100644 modelzoo/LanguageModeling/BERT/data/WikiDownloader.py create mode 100644 modelzoo/LanguageModeling/BERT/data/WikicorpusTextFormatting.py create mode 100644 modelzoo/LanguageModeling/BERT/data/__init__.py create mode 100644 modelzoo/LanguageModeling/BERT/data/bertPrep.py create mode 100644 modelzoo/LanguageModeling/BERT/data/create_biobert_datasets_from_start.sh create mode 100644 modelzoo/LanguageModeling/BERT/data/create_datasets_from_start.sh create mode 100644 modelzoo/LanguageModeling/BERT/data/images/bert_pipeline.png create mode 100644 modelzoo/LanguageModeling/BERT/data/images/images_nvlamb.png create mode 100644 modelzoo/LanguageModeling/BERT/gpu_affinity.py create mode 100644 modelzoo/LanguageModeling/BERT/input_pipeline.py delete mode 100644 modelzoo/LanguageModeling/BERT/logs/squad_train_benchmark_base_fp16_gpu4_bs8.log delete mode 100644 modelzoo/LanguageModeling/BERT/logs/tf_bert_finetuning_squad_base_fp16_gbs48.230222025408.log create mode 100644 modelzoo/LanguageModeling/BERT/model_saving_utils.py create mode 100644 modelzoo/LanguageModeling/BERT/official/modeling/__init__.py create mode 100644 modelzoo/LanguageModeling/BERT/official/modeling/hyperparams/__init__.py create mode 100644 modelzoo/LanguageModeling/BERT/official/modeling/hyperparams/params_dict.py create mode 100644 modelzoo/LanguageModeling/BERT/official/modeling/hyperparams/params_dict_test.py create mode 100644 modelzoo/LanguageModeling/BERT/official/modeling/training/__init__.py create mode 100644 modelzoo/LanguageModeling/BERT/official/modeling/training/distributed_executor.py rename {deepray/layers => modelzoo/LanguageModeling/BERT/official}/nlp/bert_modeling.py (81%) rename {deepray/layers => modelzoo/LanguageModeling/BERT/official}/nlp/bert_models.py (61%) create mode 100644 modelzoo/LanguageModeling/BERT/official/nlp/modeling/__init__.py create mode 100644 modelzoo/LanguageModeling/BERT/official/nlp/modeling/losses/__init__.py create mode 100644 modelzoo/LanguageModeling/BERT/official/nlp/modeling/losses/weighted_sparse_categorical_crossentropy.py create mode 100644 modelzoo/LanguageModeling/BERT/official/nlp/modeling/losses/weighted_sparse_categorical_crossentropy_test.py create mode 100644 modelzoo/LanguageModeling/BERT/official/nlp/transformer/__init__.py rename {deepray/layers => modelzoo/LanguageModeling/BERT/official}/nlp/transformer/beam_search_v1.py (83%) rename {deepray/layers => modelzoo/LanguageModeling/BERT/official}/nlp/transformer/beam_search_v1_test.py (85%) rename {deepray/layers => modelzoo/LanguageModeling/BERT/official}/nlp/transformer/model_params.py (94%) rename {deepray/layers => modelzoo/LanguageModeling/BERT/official}/nlp/transformer/model_utils.py (89%) rename {deepray/layers => modelzoo/LanguageModeling/BERT/official}/nlp/transformer/model_utils_test.py (74%) create mode 100644 modelzoo/LanguageModeling/BERT/run.sub create mode 100644 modelzoo/LanguageModeling/BERT/run_classifier.py create mode 100644 modelzoo/LanguageModeling/BERT/run_pretraining.py delete mode 100644 modelzoo/LanguageModeling/BERT/run_squad_predict.py create mode 100644 modelzoo/LanguageModeling/BERT/scripts/docker/build.sh create mode 100644 modelzoo/LanguageModeling/BERT/scripts/docker/launch.sh delete mode 100644 
modelzoo/LanguageModeling/BERT/scripts/gen_squad_evel.sh create mode 100644 modelzoo/LanguageModeling/BERT/squad_lib.py create mode 100644 modelzoo/LanguageModeling/BERT/squad_lib_sp.py create mode 100644 modelzoo/LanguageModeling/BERT/tf_trt.py create mode 100644 modelzoo/LanguageModeling/BERT/tokenization.py create mode 100644 modelzoo/LanguageModeling/Neural-machine-translation-with-Transformer/a.py create mode 100644 modelzoo/LanguageModeling/Neural-machine-translation-with-Transformer/models.py create mode 100644 modelzoo/LanguageModeling/Neural-machine-translation-with-Transformer/run.py create mode 100644 modelzoo/LanguageModeling/Neural-machine-translation-with-Transformer/run_dp.py rename modelzoo/Recommendation/{TFRA/run_horovod.sh => CreditCardFraudDetection/run.sh} (58%) delete mode 100644 modelzoo/Recommendation/CreditCardFraudDetection/run_horovod.sh create mode 100644 modelzoo/Recommendation/Criteo_DCN/README.md create mode 100644 modelzoo/Recommendation/Criteo_DCN/datasets/__init__.py create mode 100644 modelzoo/Recommendation/Criteo_DCN/datasets/custom_dataset.py create mode 100644 modelzoo/Recommendation/Criteo_DCN/datasets/custom_dataset_test.py create mode 100644 modelzoo/Recommendation/Criteo_DCN/dcn_v2.py create mode 100644 modelzoo/Recommendation/Criteo_DCN/eval.py create mode 100644 modelzoo/Recommendation/Criteo_DCN/feature_map_small.csv create mode 100644 modelzoo/Recommendation/Criteo_DCN/infer.py create mode 100644 modelzoo/Recommendation/Criteo_DCN/run.sh create mode 100644 modelzoo/Recommendation/Criteo_DCN/train.py create mode 100644 modelzoo/Recommendation/Criteo_DCN/train1.py create mode 100644 modelzoo/Recommendation/MovieLens/mymodel.py create mode 100644 modelzoo/Recommendation/MovieLens/run.sh create mode 100644 modelzoo/Recommendation/MovieLens/run_ranking.sh create mode 100644 modelzoo/Recommendation/MovieLens/train.py create mode 100644 modelzoo/Recommendation/MovieLens/train_ranking.py delete mode 100644 modelzoo/Recommendation/TFRA/demo.py delete mode 100644 modelzoo/Recommendation/TFRA/demo_tfra.py delete mode 100644 modelzoo/Recommendation/criteo_ctr/Frozen-Graph-TensorFlow/README.md delete mode 100644 modelzoo/Recommendation/criteo_ctr/Frozen-Graph-TensorFlow/TensorFlow_v2/README.md delete mode 100644 modelzoo/Recommendation/criteo_ctr/Frozen-Graph-TensorFlow/TensorFlow_v2/example_1.py delete mode 100644 modelzoo/Recommendation/criteo_ctr/Frozen-Graph-TensorFlow/TensorFlow_v2/example_2.py delete mode 100644 modelzoo/Recommendation/criteo_ctr/Frozen-Graph-TensorFlow/TensorFlow_v2/utils.py delete mode 100644 modelzoo/Recommendation/criteo_ctr/frozen.py delete mode 100644 modelzoo/Recommendation/criteo_ctr/optimize_for_inference.py delete mode 100644 modelzoo/Recommendation/criteo_ctr/run_horovod.sh delete mode 100644 modelzoo/Recommendation/criteo_ctr/run_optimize.sh create mode 100644 third_party/arrow/arrow-20.patch create mode 100644 third_party/clang_toolchain/BUILD create mode 100644 third_party/clang_toolchain/cc_configure_clang.bzl create mode 100644 third_party/clang_toolchain/download_clang.bzl create mode 100644 third_party/cuCollections/BUILD create mode 100644 third_party/cuCollections/cuCollections.BUILD rename third_party/{cucollection => cuCollections}/cuco.BUILD (99%) rename third_party/{cucollection => cuCollections}/cucollection.patch (90%) create mode 100644 third_party/flash_attn/BUILD create mode 100644 third_party/flash_attn/flash_attn.BUILD create mode 100644 third_party/flash_attn/flash_attn.patch create mode 100644 
third_party/gpus/BUILD.bazel create mode 100644 third_party/leveldb.BUILD create mode 100644 third_party/openssl.BUILD create mode 100644 third_party/py/BUILD create mode 100644 third_party/py/pypi.bzl create mode 100644 third_party/py/python_init_pip.bzl create mode 100644 third_party/readerwriterqueue.BUILD create mode 100644 third_party/remote_config/BUILD create mode 100644 third_party/remote_config/common.bzl create mode 100644 third_party/repo.bzl delete mode 100644 third_party/sparsehash.BUILD create mode 100644 third_party/sparsehash_c11/BUILD rename third_party/{ => sparsehash_c11}/sparsehash_c11.BUILD (100%) create mode 100644 third_party/sparsehash_c11/sparsehash_c11.patch create mode 100644 third_party/tf/BUILD create mode 100644 third_party/tf/tf_215.patch create mode 100644 third_party/xla/BUILD.bazel create mode 100644 third_party/xla/workspace.bzl create mode 100644 tools/docker/bazel.bazelrc create mode 100644 tools/docker/entry.sh create mode 100644 tools/docker/py3.10_env.yml create mode 100644 tools/docker/py3.8_env.yml create mode 100644 tools/install_deps/install_miniforge.sh create mode 100644 tools/install_deps/install_nsight-systems.sh delete mode 100644 tools/install_deps/pytest.txt delete mode 100644 tools/install_deps/tensorflow-cpu.txt delete mode 100644 tools/install_deps/typedapi.txt diff --git a/.bazelversion b/.bazelversion index 7d3cdbf0..4be2c727 100644 --- a/.bazelversion +++ b/.bazelversion @@ -1 +1 @@ -5.3.1 \ No newline at end of file +6.5.0 \ No newline at end of file diff --git a/BUILD b/BUILD index 174fad05..369f8476 100644 --- a/BUILD +++ b/BUILD @@ -1,6 +1,45 @@ +load("@bazel_skylib//rules:build_test.bzl", "build_test") + +# Copyright 2024 The Deepray Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +load("@rules_license//rules:license.bzl", "license") +load("//third_party/py:pypi.bzl", "pypi_requirement") + +package( + default_applicable_licenses = [":license"], + default_visibility = ["//deepray:__subpackages__"], +) + +license( + name = "license", + package_name = "deepray", +) + +exports_files([ + "LICENSE", + "setup.py", + "MANIFEST.in", + "README.md", + "requirements.txt", +]) + +############################################################################### +# PIP Package +############################################################################### sh_binary( name = "build_pip_pkg", - srcs = ["build_deps/build_pip_pkg.sh"], + srcs = ["//build_deps:build_pip_pkg.sh"], data = [ "LICENSE", "MANIFEST.in", diff --git a/README.md b/README.md index b40e2573..a0fdd4a4 100644 --- a/README.md +++ b/README.md @@ -1,220 +1,152 @@ +## **Introduction** +Deepray is a deep learning framework for Keras, to build model like LEGO, and train model with easier, faster and cheaper way. 
------------------ -[![PyPI Status Badge](https://badge.fury.io/py/deepray.svg)](https://pypi.org/project/deepray/) -[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/deepray)](https://pypi.org/project/deepray/) -[![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://www.tensorflow.org/deepray/api_docs/python/dp) -[![Gitter chat](https://img.shields.io/badge/chat-on%20gitter-46bc99.svg)](https://gitter.im/tensorflow/sig-deepray) -[![Code style: yapf](https://img.shields.io/badge/code%20style-yapf-blue)](https://github.com/google/yapf) +## **Why Deepray?** +Deepray contains list of features to improve usability and performance for Deep Learning, especially provides some essential components for recommendation algorithm. -### Continuous Build Status -| Build | Status | -| --- | --- | -| **Ubuntu** | [![Status](https://github.com/deepray-AI/deepray/workflows/deepray-release/badge.svg)](https://github.com/deepray-AI/deepray/actions?query=workflow%3deepray-release) | +**Trainer** + - Distributed Training with Horovod backend + - Gradient accumulation +**Layers** + - Embedding Variable from [DeepRec](https://github.com/DeepRec-AI/DeepRec). + - Compositional Embedding + - Feature Cross layer for recommendation algorithm + - ...... -**Deepray** is a repository of contributions that conform to -well-established API patterns, but implement new functionality -not available in core TensorFlow. TensorFlow natively supports -a large number of operators, layers, metrics, losses, and optimizers. -However, in a fast moving field like ML, there are many interesting new -developments that cannot be integrated into core TensorFlow -(because their broad applicability is not yet clear, or it is mostly - used by a smaller subset of the community). +**Kernels** + - Group Embedding for Embedding Variable + - ...... +**Optimizer** + - Adam/Adagrad/SDG/FTRL Optimizer for Embedding Variable + - AdamAsync Optimizer + - MultiOptimizer + - ...... -## Maintainership -The maintainer of Deepray now is [@fuhailin](https://github.com/fuhailin). If you would -like to maintain something, please feel free to submit a PR. We encourage multiple -owners for all submodules. +**Datasets** + - Parquet Dataset from [HybridBackend](https://github.com/DeepRec-AI/HybridBackend) + - ...... -## Installation -#### Stable Builds -Deepray is available on PyPI for Linux. To install the latest version, run the following: -``` -pip install deepray -``` +**......** -To ensure you have a version of TensorFlow that is compatible with Deepray, you can specify the `tensorflow` extra requirement during install: +#### Compatibility Matrix +| Deepray | TensorFlow | Compiler | cuDNN | CUDA | +| :-------------- | :--------- | :--------- | :---- | :--- | +| deepray-0.21.86 | 2.15 | GCC 11.4.0 | 8.9 | 12.3.2 | -``` -pip install deepray[tensorflow] -``` -Similar extras exist for the `tensorflow-gpu` and `tensorflow-cpu` packages. To use Deepray: +# Quick start + - Install Deepray: -```python -import tensorflow as tf -import deepray as dp +```bash +pip install deepray ``` -### Python Op Compatility -Deepray is actively working towards forward compatibility with TensorFlow 2.x. -However, there are still a few private API uses within the repository so at the moment -we can only guarantee compatibility with the TensorFlow versions which it was tested against. -Warnings will be emitted when importing `deepray` if your TensorFlow version does not match -what it was tested against. 
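+After installing, an optional sanity check (not part of the package itself) is to confirm that the
+TensorFlow build roughly matches the compatibility matrix above (TF 2.15, CUDA 12.3.2) and that the
+GPUs are visible. This is a minimal sketch using only standard TensorFlow APIs:
+
+```python
+# Optional sanity check after `pip install deepray`.
+import tensorflow as tf
+import deepray as dp  # a successful import is a basic check that the package is installed
+
+print("TensorFlow:", tf.__version__)
+print("GPUs visible:", tf.config.list_physical_devices("GPU"))
+```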
- -#### Python Op Compatibility Matrix -| Deepray | TensorFlow | Python | -| :------------- | :------------- | :------------------ | -| deepray-0.18.0 | 2.9.3 | 3.8, 3.9, 3.10, 3.11 | - - -### C++ Custom Op Compatibility -TensorFlow C++ APIs are not stable and thus we can only guarantee compatibility with the -version Deepray was built against. It is possible custom ops will work with multiple -versions of TensorFlow, but there is also a chance for segmentation faults or other problematic crashes. -Warnings will be emitted when loading a custom op if your TensorFlow version does not match -what it was built against. - -Additionally, custom ops registration does not have a stable ABI interface so it is -required that users have a compatible installation of TensorFlow even if the versions -match what we had built against. A simplification of this is that **Deepray -custom ops will work with `pip`-installed TensorFlow** but will have issues when TensorFlow -is compiled differently. A typical example of this would be `conda`-installed TensorFlow. -[RFC #133](https://github.com/tensorflow/community/pull/133) aims to fix this. - - -#### C++ Custom Op Compatibility Matrix -| Deepray | TensorFlow | Compiler | cuDNN | CUDA | -| :------------- | :--------- | :-------- | :---- | :--- | -| deepray-0.18.0 | 2.12 | GCC 9.3.1 | 8.1 | 11.8 | - - - -#### Installing from Source -You can also install from source. This requires the [Bazel]( -https://bazel.build/) build system (version >= 1.0.0). - -##### CPU Custom Ops + - Using Docker(**Recommended**): +Latest Release Images: **hailinfufu/deepray-release:nightly-py3.10-tf2.15.0-cu12.3.2-ubuntu22.04** ``` -git clone https://github.com/deepray-AI/deepray.git -cd deepray - -# This script links project with TensorFlow dependency -python3 ./configure.py - -bazel build build_pip_pkg -bazel-bin/build_pip_pkg artifacts - -pip install artifacts/deepray-*.whl +docker pull hailinfufu/deepray-release:nightly-py3.10-tf2.15.0-cu12.3.2-ubuntu22.04 +docker run -it hailinfufu/deepray-release:nightly-py3.10-tf2.15.0-cu12.3.2-ubuntu22.04 ``` -##### GPU and CPU Custom Ops + - Build from source: ``` git clone https://github.com/deepray-AI/deepray.git -cd deepray +cd deepray && bash build.sh +``` -export TF_NEED_CUDA="1" -# Set these if the below defaults are different on your system -export TF_CUDA_VERSION="11" -export TF_CUDNN_VERSION="8" -export CUDA_TOOLKIT_PATH="/usr/local/cuda" -export CUDNN_INSTALL_PATH="/usr/lib/x86_64-linux-gnu" +### Deepray example +Define the training workflow. Here's a toy example ([explore real examples](https://github.com/deepray-AI/deepray/blob/main/modelzoo/Recommendation/CreditCardFraudDetection/train.py)): -# This script links project with TensorFlow dependency -python3 ./configure.py +```python +# main.py +# ! pip install deepray +from typing import Dict -bazel build build_pip_pkg -bazel-bin/build_pip_pkg artifacts +import tensorflow as tf +from absl import flags -pip install artifacts/deepray-*.whl +import deepray as dp +from deepray.core.trainer import Trainer +from deepray.datasets.movielens.movielens_100k_ratings import Movielens100kRating +from deepray.layers.embedding_variable import EmbeddingVariable + +# -------------------------------- +# Step 1: Define a Keras Module +# -------------------------------- +class RankingModel(tf.keras.Model): + + def __init__(self, embedding_dimension=32): + super().__init__() + # Compute embeddings for users. 
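+    # EmbeddingVariable (the DeepRec-style dynamic embedding listed under "Layers" above)
+    # is keyed directly by raw ids and grows on demand, which is why only the embedding
+    # dimension is configured here and no fixed vocabulary size is declared.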
+ self.user_embeddings = EmbeddingVariable(embedding_dim=embedding_dimension) + self.movie_embeddings = EmbeddingVariable(embedding_dim=embedding_dimension) + + # Compute predictions. + self.ratings = tf.keras.Sequential( + [ + # Learn multiple dense layers. + tf.keras.layers.Dense(256, activation="relu"), + tf.keras.layers.Dense(64, activation="relu"), + # Make rating predictions in the final layer. + tf.keras.layers.Dense(1) + ] + ) + + def call(self, inputs: Dict[str, tf.Tensor]) -> tf.Tensor: + user_id, movie_title = inputs["user_id"], inputs["movie_title"] + user_id = tf.reshape(user_id, [-1]) + movie_title = tf.reshape(movie_title, [-1]) + user_embedding = self.user_embeddings(user_id) + movie_embedding = self.movie_embeddings(movie_title) + emb_vec = tf.concat([user_embedding, movie_embedding], axis=1) + return self.ratings(emb_vec) + + +# ------------------- +# Step 2: Define data +# ------------------- +data_pipe = Movielens100kRating(split=True) +dataset = data_pipe(flags.FLAGS.batch_size, is_training=True) + +# ------------------- +# Step 3: Train +# ------------------- +optimizer = dp.optimizers.Adam(learning_rate=flags.FLAGS.learning_rate, amsgrad=False) +model = RankingModel() +trainer = Trainer(model=model, optimizer=optimizer, loss="MSE", metrics=[tf.keras.metrics.RootMeanSquaredError()]) +trainer.fit(x=dataset) ``` -## Tutorials -See [`docs/tutorials/`](docs/tutorials/) -for end-to-end examples of various deepray. - -## Core Concepts +Run the model on your terminal -#### Standardized API within Subpackages -User experience and project maintainability are core concepts in -Deepray. In order to achieve these we require that our additions -conform to established API patterns seen in core TensorFlow. - -#### GPU and CPU Custom Ops -Deepray supports precompiled custom ops for CPU and GPU. However, -GPU custom ops currently only work on Linux distributions. For this reason Windows and macOS -will fallback to pure TensorFlow Python implementations whenever possible. +```bash +python main.py --batch_size=32 --learning_rate=0.03 +``` +---- +## Examples -The order of priority on macOS/Windows is: -1) Pure TensorFlow + Python implementation (works on CPU and GPU) -2) C++ implementation for CPU +###### Recommender Systems -The order of priority on Linux is: -1) CUDA implementation -2) C++ implementation -3) Pure TensorFlow + Python implementation (works on CPU and GPU) +- [Deep & Cross Network V2 with Criteo](https://github.com/deepray-AI/deepray/tree/main/modelzoo/Recommendation/criteo_ctr) +- [MovieLens](https://github.com/deepray-AI/deepray/tree/main/modelzoo/Recommendation) -If you want to change the default priority, "C++ and CUDA" VS "pure TensorFlow Python", -you can set the environment variable `DEEPRAY_PY_OPS=1` from the command line or -run `dp.options.disable_custom_kernel()` in your code. +###### Natural Language Processing -For example, if you are on Linux and you have compatibility problems with the compiled ops, -you can give priority to the Python implementations: +- [BERT](https://github.com/deepray-AI/deepray/tree/main/modelzoo/LanguageModeling/BERT) -From the command line: -```bash -export DEEPRAY_PY_OPS=1 -``` +###### Computer Vision -or in your code: - -```python -import deepray as dp -dp.options.disable_custom_kernel() -``` +- [Mnist](https://github.com/deepray-AI/deepray/tree/main/modelzoo/CV/mnist) -This variable defaults to `True` on Windows and macOS, and `False` on Linux. 
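+For a quick look at what the fitted model produces, the trained `model` from the toy example above
+can be called directly on one batch. This is only a sketch: it assumes the Movielens100kRating
+pipeline yields `(features, labels)` batches whose feature dict matches `RankingModel.call`; adjust
+the field handling if your pipeline differs.
+
+```python
+# Evaluate one batch with the model trained by trainer.fit(...) above.
+import tensorflow as tf
+
+for features, labels in dataset.take(1):
+  preds = tf.squeeze(model(features), axis=-1)  # [batch] predicted ratings
+  labels = tf.cast(labels, preds.dtype)
+  rmse = tf.sqrt(tf.reduce_mean(tf.square(labels - preds)))
+  print("RMSE on one batch:", float(rmse))
+```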
- -## Contributing -Deepray is a community-led open source project (only a few maintainers work for Google!). -As such, the project depends on public contributions, bug fixes, and documentation. -This project adheres to [TensorFlow's code of conduct](CODE_OF_CONDUCT.md). -By participating, you are expected to uphold this code. - -Do you want to contribute but are not sure of what? Here are a few suggestions: -1. Add a new tutorial. Located in [`docs/tutorials/`](docs/tutorials), - these are a great way to familiarize yourself and others with Deepray. See - [the guidelines](docs/tutorials/README.md) for more information on how to add - examples. -2. Improve the docstrings. The docstrings are fetched and then displayed in the documentation. - Do a change and hundreds of developers will see it and benefit from it. Maintainers are often focused - on making APIs, fixing bugs and other code related changes. The documentation will never - be loved enough! -3. Solve an [existing issue](https://github.com/deepray-AI/deepray/issues). - These range from low-level software bugs to higher-level design problems. - Check out the label [help wanted](https://github.com/tensorflow/deepray/issues?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22). If you're a new contributor, the label [good first issue](https://github.com/tensorflow/deepray/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) can be a good place to start. -4. Review a pull request. So you're not a software engineer but you know a lot - about a certain field a research? That's awesome and we need your help! Many people - are submitting pull requests to add layers/optimizers/functions taken from recent - papers. Since Deepray maintainers are not specialized in everything, - you can imagine how hard it is to review. It takes very long to read the paper, - understand it and check the math in the pull request. If you're specialized, look at - the [list of pull requests](https://github.com/deepray-AI/deepray/pulls). - If there is something from a paper you know, please comment on the pull request to - check the math is ok. If you see that everything is good, say it! It will help - the maintainers to sleep better at night knowing that he/she wasn't the only - person to approve the pull request. -5. You have an opinion and want to share it? The docs are not very helpful for - a function or a class? You tried to open a pull request but you didn't manage to - install or test anything and you think it's too complicated? You made a pull request - but you didn't find the process good enough and it made no sense to you? Please - say it! We want feedback. Maintainers are too much the head into the code - to understand what it's like for someone new to open source to come to this project. - If you don't understand something, be aware there are no people who are - bad at understanding, there are just bad tutorials and bad guides. - -Please see [contribution guidelines](CONTRIBUTING.md) to get started (and remember, -if you don't understand something, open an issue, or even make a pull request to -improve the guide!). - -## Community -* [Public Mailing List](https://groups.google.com/a/tensorflow.org/forum/#!forum/deepray) +## Communication +- [GitHub issues](https://github.com/deepray-AI/deepray/issues): any install, bug, feature issues. 
+- 微信号: StateOfArt ## License [Apache License 2.0](LICENSE) diff --git a/WORKSPACE b/WORKSPACE index 74519138..0fe7ed52 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -1,16 +1,99 @@ -load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") -load("//build_deps/tf_dependency:tf_configure.bzl", "tf_configure") -load("//build_deps/toolchains/gpu:cuda_configure.bzl", "cuda_configure") +workspace(name = "deepray") + +load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository") # buildifier: disable=load-on-top +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") # buildifier: disable=load-on-top + +http_archive( + name = "rules_python", + sha256 = "d71d2c67e0bce986e1c5a7731b4693226867c45bfe0b7c5e0067228a536fc580", + strip_prefix = "rules_python-0.29.0", + url = "https://github.com/bazelbuild/rules_python/releases/download/0.29.0/rules_python-0.29.0.tar.gz", +) + +load("@rules_python//python:repositories.bzl", "py_repositories", "python_register_toolchains") # buildifier: disable=load-on-top + +py_repositories() + +python_register_toolchains( + name = "python", + ignore_root_user_error = True, + python_version = "3.10", +) + +load("//third_party/xla:workspace.bzl", xla_repo = "repo") -tf_configure( - name = "local_config_tf", +xla_repo() + +# Initialize hermetic Python +load("@xla//third_party/py:python_init_rules.bzl", "python_init_rules") + +python_init_rules() + +load("@xla//third_party/py:python_init_repositories.bzl", "python_init_repositories") + +python_init_repositories( + default_python_version = "system", + requirements = { + "3.10": "//build_deps:requirements_lock_3_10.txt", + "3.11": "//build_deps:requirements_lock_3_11.txt", + "3.12": "//build_deps:requirements_lock_3_12.txt", + "3.13": "//build_deps:requirements_lock_3_13.txt", + }, ) +load("@xla//third_party/py:python_init_toolchains.bzl", "python_init_toolchains") + +python_init_toolchains() + +load("//third_party/py:python_init_pip.bzl", "python_init_pip") + +python_init_pip() + +load("@pypi//:requirements.bzl", "install_deps") + +install_deps() + +load("@xla//:workspace4.bzl", "xla_workspace4") + +xla_workspace4() + +load("@tsl//third_party/gpus/cuda/hermetic:cuda_json_init_repository.bzl", "cuda_json_init_repository") + +cuda_json_init_repository() + +load("@cuda_redist_json//:distributions.bzl", "CUDA_REDISTRIBUTIONS", "CUDNN_REDISTRIBUTIONS") +load("@tsl//third_party/gpus/cuda/hermetic:cuda_redist_init_repositories.bzl", "cuda_redist_init_repositories", "cudnn_redist_init_repository") + +cuda_redist_init_repositories(cuda_redistributions = CUDA_REDISTRIBUTIONS) + +cudnn_redist_init_repository(cudnn_redistributions = CUDNN_REDISTRIBUTIONS) + +load("@tsl//third_party/gpus/cuda/hermetic:cuda_configure.bzl", "cuda_configure") + +cuda_configure(name = "local_config_cuda") + +load("@tsl//third_party/nccl/hermetic:nccl_redist_init_repository.bzl", "nccl_redist_init_repository") + +nccl_redist_init_repository() + +load("@tsl//third_party/nccl/hermetic:nccl_configure.bzl", "nccl_configure") + +nccl_configure(name = "local_config_nccl") + +load("//build_deps/tf_dependency:tf_configure.bzl", "tf_configure") + +tf_configure(name = "local_config_tf") + http_archive( name = "org_tensorflow", - strip_prefix = "tensorflow-2.9.1", + patch_args = ["-p1"], + patches = [ + "//third_party/tf:tf_215.patch", + ], + sha256 = "f36416d831f06fe866e149c7cd752da410a11178b01ff5620e9f265511ed57cf", + strip_prefix = "tensorflow-2.15.1", urls = [ - "https://github.com/tensorflow/tensorflow/archive/refs/tags/v2.9.1.tar.gz", + 
"https://github.com/tensorflow/tensorflow/archive/refs/tags/v2.15.1.tar.gz", ], ) @@ -30,12 +113,6 @@ load("@org_tensorflow//tensorflow:workspace0.bzl", "tf_workspace0") tf_workspace0() -# Initialize the TensorFlow repository and all dependencies. -# -# The cascade of load() statements and tf_workspace?() calls works around the -# restriction that load() statements need to be at the top of .bzl files. -# E.g. we can not retrieve a new repository with http_archive and then load() -# a macro from that repository in the same file. load("@//deepray:workspace3.bzl", "dp_workspace3") dp_workspace3() @@ -51,5 +128,3 @@ dp_workspace2() load("@//deepray:workspace0.bzl", "dp_workspace0") dp_workspace0() - -cuda_configure(name = "local_config_cuda") diff --git a/build.sh b/build.sh index 8f2db504..b2435923 100644 --- a/build.sh +++ b/build.sh @@ -3,11 +3,10 @@ set -e yes "" | bash ./configure || true -# bazel build build_pip_pkg \ -# --action_env=HTTP_PROXY=http://127.0.0.1:7890 \ -# --action_env=HTTPS_PROXY=http://127.0.0.1:7890 - -bazel build build_pip_pkg +# --compilation_mode dbg \ +bazel build build_pip_pkg \ + --copt=-O3 --copt=-march=native \ + -s rm -rf artifacts/ diff --git a/build_deps/BUILD b/build_deps/BUILD new file mode 100644 index 00000000..8298d741 --- /dev/null +++ b/build_deps/BUILD @@ -0,0 +1,37 @@ +# Copyright 2024 The Deepray Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +load("@python//:defs.bzl", "compile_pip_requirements") +load("@python_version_repo//:py_version.bzl", "REQUIREMENTS") + +licenses(["notice"]) + +package( + default_visibility = ["//deepray:__subpackages__"], +) + +exports_files(["build_pip_pkg.sh"]) + +compile_pip_requirements( + name = "requirements", + timeout = "moderate", + extra_args = [ + "--allow-unsafe", + "--build-isolation", + "--rebuild", + "--resolver=backtracking", + "-i https://pypi.tuna.tsinghua.edu.cn/simple", + ], + requirements_in = "requirements.in", + requirements_txt = REQUIREMENTS, +) diff --git a/build_deps/build_pip_pkg.sh b/build_deps/build_pip_pkg.sh index 07fe4ca4..5e962bcc 100755 --- a/build_deps/build_pip_pkg.sh +++ b/build_deps/build_pip_pkg.sh @@ -28,9 +28,9 @@ function is_macos() { } if is_windows; then - PIP_FILE_PREFIX="bazel-bin/build_pip_pkg.exe.runfiles/__main__/" + PIP_FILE_PREFIX="bazel-bin/build_pip_pkg.exe.runfiles/deepray/" else - PIP_FILE_PREFIX="bazel-bin/build_pip_pkg.runfiles/__main__/" + PIP_FILE_PREFIX="bazel-bin/build_pip_pkg.runfiles/deepray/" fi function abspath() { diff --git a/build_deps/patches/BUILD b/build_deps/patches/BUILD new file mode 100644 index 00000000..a4988a91 --- /dev/null +++ b/build_deps/patches/BUILD @@ -0,0 +1,15 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This empty BUILD file is required to make Bazel treat this directory as a package. diff --git a/build_deps/patches/internal_visibility.patch b/build_deps/patches/internal_visibility.patch new file mode 100644 index 00000000..7703001e --- /dev/null +++ b/build_deps/patches/internal_visibility.patch @@ -0,0 +1,13 @@ +diff --git tensorflow/BUILD tensorflow/BUILD +index 202553cd531..171eb04665c 100644 +--- tensorflow/BUILD ++++ tensorflow/BUILD +@@ -1039,7 +1039,7 @@ package_group( + "//smartass/brain/configure/...", + "//tensorflow/...", + "//tensorflow_decision_forests/...", +- "//tensorflow_federated/...", ++ "public", + "//third_party/cloud_tpu/convergence_tools/sdc_monitoring/...", + "//third_party/cloud_tpu/inference_converter/...", + "//third_party/py/cloud_ml_autoflow/...", diff --git a/build_deps/patches/python_toolchain.patch b/build_deps/patches/python_toolchain.patch new file mode 100644 index 00000000..14f3dc69 --- /dev/null +++ b/build_deps/patches/python_toolchain.patch @@ -0,0 +1,74 @@ +diff --git tensorflow/tools/toolchains/cpus/aarch64/aarch64_compiler_configure.bzl tensorflow/tools/toolchains/cpus/aarch64/aarch64_compiler_configure.bzl +index a2bdd6a7eed..ec25c23d8d4 100644 +--- tensorflow/tools/toolchains/cpus/aarch64/aarch64_compiler_configure.bzl ++++ tensorflow/tools/toolchains/cpus/aarch64/aarch64_compiler_configure.bzl +@@ -2,7 +2,7 @@ + + load("//tensorflow/tools/toolchains:cpus/aarch64/aarch64.bzl", "remote_aarch64_configure") + load("//third_party/remote_config:remote_platform_configure.bzl", "remote_platform_configure") +-load("//third_party/py:python_configure.bzl", "remote_python_configure") ++load("//third_party/py/non_hermetic:python_configure.bzl", "remote_python_configure") + + def ml2014_tf_aarch64_configs(name_container_map, env): + for name, container in name_container_map.items(): +diff --git tensorflow/tools/toolchains/remote_config/rbe_config.bzl tensorflow/tools/toolchains/remote_config/rbe_config.bzl +index 9f71a414bf7..57f70752323 100644 +--- tensorflow/tools/toolchains/remote_config/rbe_config.bzl ++++ tensorflow/tools/toolchains/remote_config/rbe_config.bzl +@@ -1,6 +1,6 @@ + """Macro that creates external repositories for remote config.""" + +-load("//third_party/py:python_configure.bzl", "local_python_configure", "remote_python_configure") ++load("//third_party/py/non_hermetic:python_configure.bzl", "local_python_configure", "remote_python_configure") + load("//third_party/gpus:cuda_configure.bzl", "remote_cuda_configure") + load("//third_party/nccl:nccl_configure.bzl", "remote_nccl_configure") + load("//third_party/gpus:rocm_configure.bzl", "remote_rocm_configure") +diff --git tensorflow/workspace2.bzl tensorflow/workspace2.bzl +index 7e9faa558a4..5b18cb0969a 100644 +--- tensorflow/workspace2.bzl ++++ tensorflow/workspace2.bzl +@@ -8,7 +8,7 @@ load("//third_party/gpus:rocm_configure.bzl", "rocm_configure") + load("//third_party/tensorrt:tensorrt_configure.bzl", "tensorrt_configure") + load("//third_party/nccl:nccl_configure.bzl", "nccl_configure") + load("//third_party/git:git_configure.bzl", "git_configure") 
+-load("//third_party/py:python_configure.bzl", "python_configure") ++load("//third_party/py/non_hermetic:python_configure.bzl", "python_configure") + load("//third_party/systemlibs:syslibs_configure.bzl", "syslibs_configure") + load("//tensorflow/tools/toolchains:cpus/aarch64/aarch64_compiler_configure.bzl", "aarch64_compiler_configure") + load("//tensorflow/tools/toolchains:cpus/arm/arm_compiler_configure.bzl", "arm_compiler_configure") +diff --git third_party/py/non_hermetic/python_configure.bzl third_party/py/non_hermetic/python_configure.bzl +index 300cbfb6c71..09d98505dd9 100644 +--- third_party/py/non_hermetic/python_configure.bzl ++++ third_party/py/non_hermetic/python_configure.bzl +@@ -206,7 +206,7 @@ def _create_local_python_repository(repository_ctx): + # Resolve all labels before doing any real work. Resolving causes the + # function to be restarted with all previous state being lost. This + # can easily lead to a O(n^2) runtime in the number of labels. +- build_tpl = repository_ctx.path(Label("//third_party/py:BUILD.tpl")) ++ build_tpl = repository_ctx.path(Label("//third_party/py/non_hermetic:BUILD.tpl")) + + python_bin = get_python_bin(repository_ctx) + _check_python_bin(repository_ctx, python_bin) +diff --git third_party/py/numpy/BUILD third_party/py/numpy/BUILD +index 97c7907fc38..c80cc5287bc 100644 +--- third_party/py/numpy/BUILD ++++ third_party/py/numpy/BUILD +@@ -2,14 +2,15 @@ licenses(["restricted"]) + + package(default_visibility = ["//visibility:public"]) + +-alias( ++py_library( + name = "numpy", +- actual = "@pypi_numpy//:pkg", ++ srcs = ["tf_numpy_dummy.py"] ++ srcs_version = "PY3", + ) + + alias( + name = "headers", +- actual = "@pypi_numpy//:numpy_headers", ++ actual = "@local_config_python//:numpy_headers", + ) + + genrule( diff --git a/build_deps/patches/tensorflow_llvm_url.patch b/build_deps/patches/tensorflow_llvm_url.patch new file mode 100644 index 00000000..88136b56 --- /dev/null +++ b/build_deps/patches/tensorflow_llvm_url.patch @@ -0,0 +1,23 @@ +diff --git third_party/llvm/workspace.bzl third_party/llvm/workspace.bzl +index 038e0ee5fe5..4693f5cfadc 100644 +--- third_party/llvm/workspace.bzl ++++ third_party/llvm/workspace.bzl +@@ -5,15 +5,15 @@ load("//third_party:repo.bzl", "tf_http_archive") + def repo(name): + """Imports LLVM.""" + LLVM_COMMIT = "668e33c6401abe7844691fb7d47a3cf2d2012dbc" +- LLVM_SHA256 = "b97fefaa486b106c8dd45b963116ed7684d8f3f55682116d5760b0b60db17702" ++ LLVM_SHA256 = "f6659fe4c8bfb271262abbe52f1f1320d12174504202c7c4bc4bce0910511297" + + tf_http_archive( + name = name, + sha256 = LLVM_SHA256, +- strip_prefix = "llvm-project-{commit}".format(commit = LLVM_COMMIT), ++ strip_prefix = "llvm-llvm-project-{commit_partial}".format(commit_partial = LLVM_COMMIT[:7]), + urls = [ + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), +- "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), ++ "https://api.github.com/repos/llvm/llvm-project/tarball/{commit}".format(commit = LLVM_COMMIT), + ], + build_file = "//third_party/llvm:llvm.BUILD", + patch_file = [ diff --git a/build_deps/patches/tensorflow_serving.patch b/build_deps/patches/tensorflow_serving.patch new file mode 100644 index 00000000..9808ef8c --- /dev/null +++ b/build_deps/patches/tensorflow_serving.patch @@ -0,0 +1,25 @@ +diff --git a/tensorflow_serving/util/net_http/server/public/BUILD b/tensorflow_serving/util/net_http/server/public/BUILD +index 
e7f96d98..2ae0530a 100644 +--- tensorflow_serving/util/net_http/server/public/BUILD ++++ tensorflow_serving/util/net_http/server/public/BUILD +@@ -34,6 +34,7 @@ cc_library( + hdrs = [ + "httpserver.h", + ], ++ visibility = ["//visibility:public"], + deps = [ + ":http_server_api", + "//tensorflow_serving/util/net_http/server/internal:evhttp_server", +diff --git a/tensorflow_serving/workspace.bzl b/tensorflow_serving/workspace.bzl +index 08c3cc28..0803cdf3 100644 +--- tensorflow_serving/workspace.bzl ++++ tensorflow_serving/workspace.bzl +@@ -31,7 +31,7 @@ def tf_serving_workspace(): + url = "https://github.com/libevent/libevent/archive/release-2.1.8-stable.zip", + sha256 = "70158101eab7ed44fd9cc34e7f247b3cae91a8e4490745d9d6eb7edc184e4d96", + strip_prefix = "libevent-release-2.1.8-stable", +- build_file = "@//third_party/libevent:BUILD", ++ build_file = "@//third_party:event.BUILD.bzl", + ) + + # ===== ICU dependency ===== diff --git a/build_deps/patches/tensorflow_tf_gen_op_wrapper_py.patch b/build_deps/patches/tensorflow_tf_gen_op_wrapper_py.patch new file mode 100644 index 00000000..f24ed7dc --- /dev/null +++ b/build_deps/patches/tensorflow_tf_gen_op_wrapper_py.patch @@ -0,0 +1,11 @@ +--- tensorflow/tensorflow.bzl ++++ tensorflow/tensorflow.bzl +@@ -1473,7 +1473,7 @@ + # Make a py_library out of the generated python file. + if not generated_target_name: + generated_target_name = name +- py_deps = [clean_dep("//tensorflow/python/framework:for_generated_wrappers_v2")] ++ py_deps = ["@pypi_tensorflow//:pkg"] + if extra_py_deps: + py_deps += extra_py_deps + py_lib_rule( diff --git a/build_deps/patches/tensorflow_zlib.patch b/build_deps/patches/tensorflow_zlib.patch new file mode 100644 index 00000000..e551f3d6 --- /dev/null +++ b/build_deps/patches/tensorflow_zlib.patch @@ -0,0 +1,11 @@ +--- third_party/zlib.BUILD ++++ third_party/zlib.BUILD +@@ -31,7 +31,7 @@ + "zutil.c", + "zutil.h", + ], +- hdrs = ["zlib.h"], ++ hdrs = ["zconf.h", "zlib.h"], + copts = select({ + "@org_tensorflow//tensorflow/tsl:windows": [], + "//conditions:default": [ diff --git a/build_deps/patches/tf2xla_visibility.patch b/build_deps/patches/tf2xla_visibility.patch new file mode 100644 index 00000000..84b7cdca --- /dev/null +++ b/build_deps/patches/tf2xla_visibility.patch @@ -0,0 +1,13 @@ +diff --git tensorflow/compiler/tf2xla/BUILD tensorflow/compiler/tf2xla/BUILD +index 22d9877bed9..3f6b421465d 100644 +--- tensorflow/compiler/tf2xla/BUILD ++++ tensorflow/compiler/tf2xla/BUILD +@@ -46,7 +46,7 @@ package_group( + packages = [ + "//platforms/performance/automl/...", + "//tensorflow/...", +- "//tensorflow_federated/cc/core/impl/executors/...", ++ "public", + "//tensorflow_models/...", + "//third_party/deepmind/deepmind_research/density_functional_approximation_dm21/...", + "//third_party/mlir_edge/model_curriculum/iree/...", diff --git a/build_deps/pip_tf/BUILD b/build_deps/pip_tf/BUILD new file mode 100644 index 00000000..56ebe258 --- /dev/null +++ b/build_deps/pip_tf/BUILD @@ -0,0 +1,30 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +load("@rules_python//python:defs.bzl", "py_test") +load(":defs.bzl", "PIP_TF_COPTS", "PIP_TF_CXXOPTS", "PIP_TF_LINKOPTS") + +py_test( + name = "pip_tf_flags_test", + srcs = ["pip_tf_flags_test.py"], + args = [ + "--copts=" + ",".join(PIP_TF_COPTS), + "--cxxopts=" + ",".join(PIP_TF_CXXOPTS), + "--linkopts=" + ",".join(PIP_TF_LINKOPTS), + ], + deps = [ + "@pypi_absl_py//:pkg", + "@pypi_tensorflow//:pkg", + ], +) diff --git a/build_deps/pip_tf/README.md b/build_deps/pip_tf/README.md new file mode 100644 index 00000000..79283160 --- /dev/null +++ b/build_deps/pip_tf/README.md @@ -0,0 +1,25 @@ + +When building libraries (such as custom op libraries) against the TensorFlow pip +package, care must be taken to ensure those libraries build against that +package's headers and with the same compiler and linker flags as that package +was compiled with. These utilities help ensure that's the case. + +This package assumes Tensorflow is available in the `@pypi_tensorflow` package, +with the additional build content specified in `TF_ADDITIVE_BUILD_CONTENT`: + +``` +load("@com_google_fcp//tensorflow/pip_tf:defs.bzl", "TF_ADDITIVE_BUILD_CONTENT") + +pip_parse( + name = "pypi", + annotations = { + "tensorflow": package_annotation( + additive_build_content = TF_ADDITIVE_BUILD_CONTENT, + ), + }, + ... +) +``` + +NOTE: The `gpu_srcs` and `gpu_deps` parameters supported by TensorFlow's version +of `tf_custom_op_library` are not supported by this version. diff --git a/build_deps/pip_tf/defs.bzl b/build_deps/pip_tf/defs.bzl new file mode 100644 index 00000000..c5395854 --- /dev/null +++ b/build_deps/pip_tf/defs.bzl @@ -0,0 +1,132 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Provides rules for building custom TensorFlow ops compatible with pip.""" + +load("@local_config_cuda//cuda:build_defs.bzl", "cuda_library", "if_cuda", "if_cuda_is_configured") + +# Build flags for using the pip-provided TensorFlow package. pip_tf_flags_test ensures that these +# values stay in sync with the currently-used TF version. +PIP_TF_COPTS = ["-DEIGEN_MAX_ALIGN_BYTES=64", "-D_GLIBCXX_USE_CXX11_ABI=1"] +PIP_TF_CXXOPTS = ["--std=c++17"] +PIP_TF_LINKOPTS = [] + +def _force_pip_tf_transition_impl(settings, _attr): + copts = list(settings["//command_line_option:copt"]) + cxxopts = list(settings["//command_line_option:cxxopt"]) + linkopts = list(settings["//command_line_option:linkopt"]) + copts += PIP_TF_COPTS + cxxopts += PIP_TF_CXXOPTS + linkopts += PIP_TF_LINKOPTS + + # TensorFlow's pip package was built with libstdc++. 
+ # TODO: Enable when clang build + # cxxopts.append("-stdlib=libstdc++") + # linkopts.append("-stdlib=libstdc++") + + return { + "//command_line_option:copt": copts, + "//command_line_option:cxxopt": cxxopts, + "//command_line_option:linkopt": linkopts, + } + +_force_pip_tf_transition = transition( + implementation = _force_pip_tf_transition_impl, + inputs = [ + "//command_line_option:copt", + "//command_line_option:cxxopt", + "//command_line_option:linkopt", + ], + outputs = [ + "//command_line_option:copt", + "//command_line_option:cxxopt", + "//command_line_option:linkopt", + ], +) + +def _force_pip_tf_impl(ctx): + cc_binary = ctx.attr.cc_binary[0] + output_file = ctx.actions.declare_file(ctx.label.name) + ctx.actions.symlink( + output = output_file, + target_file = cc_binary.files.to_list()[0], + ) + return DefaultInfo( + files = depset([output_file]), + data_runfiles = ctx.runfiles(transitive_files = depset([output_file])), + ) + +_force_pip_tf = rule( + doc = """Forces a shared library to be built in a way that's compatible +with the pip-provided Python TensorFlow package.""", + implementation = _force_pip_tf_impl, + attrs = { + "cc_binary": attr.label( + cfg = _force_pip_tf_transition, + mandatory = True, + doc = "The cc_binary target to build with TensorFlow compatibility.", + ), + "_allowlist_function_transition": attr.label( + default = "@bazel_tools//tools/allowlists/function_transition_allowlist", + ), + }, +) + +def tf_custom_op_library( + name, + srcs = [], + gpu_srcs = [], + deps = [], + gpu_deps = None, + tags = [], + visibility = None, + **kwargs): + """Replacement for TF's custom_op_library that targets pip-provided TF. + + This rule will force a transition to an environment that targets the + pip-provided TF library. This means that all deps of this target and the + target's own sources will be compiled with the necessary compiler flags to + correctly target a pip TF library. + """ + if not gpu_deps: + gpu_deps = [] + + if gpu_srcs: + basename = name.split(".")[0] + cuda_library( + name = basename + "_gpu", + srcs = gpu_srcs, + deps = deps + gpu_deps, + **kwargs + ) + deps = deps + [":" + basename + "_gpu"] + + native.cc_binary( + name = name + "_lib", + srcs = srcs, + linkshared = 1, + deps = deps + [ + "@pypi_tensorflow//:libtensorflow_framework", + "@pypi_tensorflow//:tf_headers", + ], + tags = tags + ["manual"], + visibility = ["//visibility:private"], + **kwargs + ) + + _force_pip_tf( + name = name, + cc_binary = name + "_lib", + visibility = visibility, + tags = tags, + ) diff --git a/build_deps/pip_tf/pip_tf_flags_test.py b/build_deps/pip_tf/pip_tf_flags_test.py new file mode 100644 index 00000000..dec43f4a --- /dev/null +++ b/build_deps/pip_tf/pip_tf_flags_test.py @@ -0,0 +1,65 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Verifies that build flags for custom TF ops are correct.""" + +import re + +from absl import flags +from absl.testing import absltest +import tensorflow as tf + +_COPTS = flags.DEFINE_list('copts', [], 'TF copts') +_CXXOPTS = flags.DEFINE_list('cxxopts', [], 'TF cxxopts') +_LINKOPTS = flags.DEFINE_list('linkopts', [], 'TF linkopts') + +_ERROR_MSG = """ +If the TensorFlow version has been updated, copy the new value to +fcp/tensorflow/pip_tf/defs.bzl. +""" + + +class PipTfFlagsTest(absltest.TestCase): + + def test_compile_flags(self): + copts = [] + cxxopts = [] + for flag in tf.sysconfig.get_compile_flags(): + # Ignore include flags, which are handled by bazel. + if flag.startswith('-I'): + continue + + if flag.startswith('--std=c++'): # Don't add C++-only flags to copts. + cxxopts.append(flag) + else: + copts.append(flag) + + self.assertSameElements(copts, _COPTS.value, _ERROR_MSG) + self.assertSameElements(cxxopts, _CXXOPTS.value, _ERROR_MSG) + + def test_link_flags(self): + linkopts = [] + for flag in tf.sysconfig.get_link_flags(): + # Ignore library search paths, which are handled by bazel. + if flag.startswith('-L'): + continue + # Ignore -ltensorflow_framework, which is handled by bazel. + if re.search(r'^-l(:lib)?tensorflow_framework', flag): + continue + linkopts.append(flag) + + self.assertSameElements(linkopts, _LINKOPTS.value, _ERROR_MSG) + + +if __name__ == '__main__': + absltest.main() diff --git a/build_deps/pip_tf/tensorflow.bzl b/build_deps/pip_tf/tensorflow.bzl new file mode 100644 index 00000000..5f164549 --- /dev/null +++ b/build_deps/pip_tf/tensorflow.bzl @@ -0,0 +1,3536 @@ +# +# Returns the options to use for a C++ library or binary build. +# Uses the ":optmode" config_setting to pick the options. +load( + "@org_tensorflow//tensorflow/core/platform:build_config_root.bzl", + "if_dynamic_kernels", + "if_static", + "tf_additional_grpc_deps_py", + "tf_additional_xla_deps_py", + "tf_exec_properties", + "tf_gpu_tests_tags", +) +load( + "@org_tensorflow//tensorflow/core/platform:rules_cc.bzl", + "cc_binary", + "cc_library", + "cc_shared_library", + "cc_test", +) +load( + "@org_tensorflow//tensorflow/tsl:tsl.bzl", + "tsl_gpu_library", + _cc_header_only_library = "cc_header_only_library", + _clean_dep = "clean_dep", + _if_cuda_or_rocm = "if_cuda_or_rocm", + _if_nccl = "if_nccl", + _transitive_hdrs = "transitive_hdrs", +) +load( + "@local_config_tensorrt//:build_defs.bzl", + "if_tensorrt", + "if_tensorrt_exec", +) +load( + "@local_config_cuda//cuda:build_defs.bzl", + "cuda_library", + "if_cuda", + "if_cuda_exec", +) +load( + "@local_config_rocm//rocm:build_defs.bzl", + "if_rocm", + "rocm_copts", +) +load( + "@org_tensorflow//third_party/mkl:build_defs.bzl", + "if_enable_mkl", + "if_mkl", + "if_mkl_ml", +) +load( + "@org_tensorflow//third_party/mkl_dnn:build_defs.bzl", + "if_mkldnn_aarch64_acl", + "if_mkldnn_aarch64_acl_openmp", + "if_mkldnn_openmp", +) +load( + "@org_tensorflow//tensorflow/tsl/mkl:build_defs.bzl", + "onednn_v3_define", +) +load( + "@org_tensorflow//third_party/compute_library:build_defs.bzl", + "if_enable_acl", +) +load( + "@org_tensorflow//third_party/llvm_openmp:openmp.bzl", + "windows_llvm_openmp_linkopts", +) +load( + "@org_tensorflow//tensorflow:py.default.bzl", + _plain_py_binary = "py_binary", + _plain_py_library = "py_library", + _plain_py_test = "py_test", +) +load("@bazel_skylib//lib:new_sets.bzl", "sets") +load("@bazel_skylib//rules:common_settings.bzl", "BuildSettingInfo") + +def register_extension_info(**kwargs): + pass + +# version for the 
shared libraries, can +# not contain rc or alpha, only numbers. +# Also update tensorflow/core/public/version.h +# and tensorflow/tools/pip_package/setup.py +VERSION = "2.14.0" +VERSION_MAJOR = VERSION.split(".")[0] +two_gpu_tags = ["requires-gpu-nvidia:2", "notap", "manual", "no_pip"] + +# The workspace root, to be used to set workspace 'include' paths in a way that +# will still work correctly when TensorFlow is included as a dependency of an +# external project. +workspace_root = Label("//:WORKSPACE").workspace_root or "." + +clean_dep = _clean_dep +cc_header_only_library = _cc_header_only_library +transitive_hdrs = _transitive_hdrs + +def if_oss(oss_value, google_value = []): + """Returns one of the arguments based on the non-configurable build env. + + Specifically, it does not return a `select`, and can be used to e.g. + compute elements of list attributes. + """ + return oss_value # copybara:comment_replace return google_value + +def if_google(google_value, oss_value = []): + """Returns one of the arguments based on the non-configurable build env. + + Specifically, it does not return a `select`, and can be used to e.g. + compute elements of list attributes. + """ + return oss_value # copybara:comment_replace return google_value + +def if_v2(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:api_version_2"): a, + "//conditions:default": [], + }) + +def if_not_v2(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:api_version_2"): [], + "//conditions:default": a, + }) + +def if_nvcc(a): + return select({ + "@local_config_cuda//cuda:using_nvcc": a, + "//conditions:default": [], + }) + +def if_xla_available(if_true, if_false = []): + return select({ + clean_dep("@org_tensorflow//tensorflow:with_xla_support"): if_true, + "//conditions:default": if_false, + }) + +# Given a source file, generate a test name. +# i.e. "common_runtime/direct_session_test.cc" becomes +# "common_runtime_direct_session_test" +def src_to_test_name(src): + return src.replace("/", "_").replace(":", "_").split(".")[0] + +def full_path(relative_paths): + return [native.package_name() + "/" + relative for relative in relative_paths] + +def _add_tfcore_prefix(src): + if src.startswith("//"): + return src + return "@org_tensorflow//tensorflow/core:" + src + +def tf_android_core_proto_headers(core_proto_sources_relative): + """Returns the list of pb.h and proto.h headers that are generated for the provided sources.""" + return ([ + _add_tfcore_prefix(p).replace(":", "/").replace(".proto", ".pb.h") + for p in core_proto_sources_relative + ] + [ + _add_tfcore_prefix(p).replace(":", "/").replace(".proto", ".proto.h") + for p in core_proto_sources_relative + ]) + +def tf_portable_full_lite_protos(full, lite): + return select({ + "@org_tensorflow//tensorflow:mobile_lite_protos": lite, + "@org_tensorflow//tensorflow:mobile_full_protos": full, + # The default should probably be lite runtime, but since most clients + # seem to use the non-lite version, let's make that the default for now. 
+ "//conditions:default": full, + }) + +def if_no_default_logger(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:no_default_logger"): a, + "//conditions:default": [], + }) + +def if_android_x86(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:android_x86"): a, + clean_dep("@org_tensorflow//tensorflow:android_x86_64"): a, + "//conditions:default": [], + }) + +def if_android_arm(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:android_arm"): a, + "//conditions:default": [], + }) + +def if_android_arm64(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:android_arm64"): a, + "//conditions:default": [], + }) + +def if_android_mips(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:android_mips"): a, + "//conditions:default": [], + }) + +def if_not_android(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:android"): [], + "//conditions:default": a, + }) + +def if_not_android_mips_and_mips64(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:android_mips"): [], + clean_dep("@org_tensorflow//tensorflow:android_mips64"): [], + "//conditions:default": a, + }) + +def if_android(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:android"): a, + "//conditions:default": [], + }) + +def if_android_or_ios(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:android"): a, + clean_dep("@org_tensorflow//tensorflow:ios"): a, + "//conditions:default": [], + }) + +def if_emscripten(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:emscripten"): a, + "//conditions:default": [], + }) + +def if_chromiumos(a, otherwise = []): + return select({ + clean_dep("@org_tensorflow//tensorflow:chromiumos"): a, + "//conditions:default": otherwise, + }) + +def if_macos(a, otherwise = []): + return select({ + clean_dep("@org_tensorflow//tensorflow:macos"): a, + "//conditions:default": otherwise, + }) + +def if_ios(a, otherwise = []): + return select({ + clean_dep("@org_tensorflow//tensorflow:ios"): a, + "//conditions:default": otherwise, + }) + +def if_ios_x86_64(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:ios_x86_64"): a, + "//conditions:default": [], + }) + +def if_mobile(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:mobile"): a, + "//conditions:default": [], + }) + +def if_not_mobile(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:mobile"): [], + "//conditions:default": a, + }) + +# Config setting selector used when building for products +# which requires restricted licenses to be avoided. 
+def if_not_mobile_or_arm_or_lgpl_restricted(a): + _ = (a,) + return select({ + "//conditions:default": [], + }) + +def if_not_windows(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:windows"): [], + "//conditions:default": a, + }) + +def if_windows(a, otherwise = []): + return select({ + clean_dep("@org_tensorflow//tensorflow:windows"): a, + "//conditions:default": otherwise, + }) + +def if_windows_cuda(a, otherwise = []): + return select({ + clean_dep("@org_tensorflow//tensorflow:is_cuda_enabled_and_windows"): a, + "//conditions:default": otherwise, + }) + +def if_not_fuchsia(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:fuchsia"): [], + "//conditions:default": a, + }) + +def if_linux_x86_64(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:linux_x86_64"): a, + "//conditions:default": [], + }) + +def if_override_eigen_strong_inline(a): + return select({ + clean_dep("@org_tensorflow//tensorflow:override_eigen_strong_inline"): a, + "//conditions:default": [], + }) + +if_nccl = _if_nccl + +def if_zendnn(if_true, if_false = []): + return select({ + clean_dep("@org_tensorflow//tensorflow:linux_x86_64"): if_true, + "//conditions:default": if_false, + }) + +def if_libtpu(if_true, if_false = []): + """Shorthand for select()ing whether to build backend support for TPUs when building libtpu.so""" + return select({ + # copybara:uncomment_begin(different config setting in OSS) + # "//tools/cc_target_os:gce": if_true, + # copybara:uncomment_end_and_comment_begin + clean_dep("@org_tensorflow//tensorflow:with_tpu_support"): if_true, + # copybara:comment_end + "//conditions:default": if_false, + }) + +def if_with_tpu_support(if_true, if_false = []): + """Shorthand for select()ing whether to build API support for TPUs when building TensorFlow""" + return select({ + "@org_tensorflow//tensorflow:with_tpu_support": if_true, + "//conditions:default": if_false, + }) + +def if_registration_v2(if_true, if_false = []): + return select({ + "@org_tensorflow//tensorflow:registration_v2": if_true, + "//conditions:default": if_false, + }) + +def if_portable(if_true, if_false = []): + return if_true + +ADDITIONAL_API_INDEXABLE_SETTINGS = [] + +# We are never indexing generated code in the OSS build, but still +# return a select() for consistency. +def if_indexing_source_code( + if_true, # @unused + if_false): + """Return a select() on whether or not we are building for source code indexing.""" + return select({ + "//conditions:default": if_false, + }) + +# Linux systems may required -lrt linker flag for e.g. clock_gettime +# see https://github.com/tensorflow/tensorflow/issues/15129 +def lrt_if_needed(): + lrt = ["-lrt"] + return select({ + clean_dep("@org_tensorflow//tensorflow:linux_aarch64"): lrt, + clean_dep("@org_tensorflow//tensorflow:linux_x86_64"): lrt, + clean_dep("@org_tensorflow//tensorflow:linux_ppc64le"): lrt, + "//conditions:default": [], + }) + +def get_win_copts(is_external = False): + WINDOWS_COPTS = [ + # copybara:uncomment_begin(no MSVC flags in google) + # "-DPLATFORM_WINDOWS", + # "-DEIGEN_HAS_C99_MATH", + # "-DTENSORFLOW_USE_EIGEN_THREADPOOL", + # "-DEIGEN_AVOID_STL_ARRAY", + # "-Iexternal/gemmlowp", + # "-Wno-sign-compare", + # "-DNOGDI", + # copybara:uncomment_end_and_comment_begin + "/DPLATFORM_WINDOWS", + "/DEIGEN_HAS_C99_MATH", + "/DTENSORFLOW_USE_EIGEN_THREADPOOL", + "/DEIGEN_AVOID_STL_ARRAY", + "/Iexternal/gemmlowp", + "/wd4018", # -Wno-sign-compare + # Bazel's CROSSTOOL currently pass /EHsc to enable exception by + # default. 
We can't pass /EHs-c- to disable exception, otherwise + # we will get a waterfall of flag conflict warnings. Wait for + # Bazel to fix this. + # "/D_HAS_EXCEPTIONS=0", + # "/EHs-c-", + "/wd4577", + "/DNOGDI", + # Also see build:windows lines in tensorflow/opensource_only/.bazelrc + # where we set some other options globally. + # copybara:comment_end + ] + + if is_external: + return WINDOWS_COPTS + [if_oss( + "/UTF_COMPILE_LIBRARY", + "-UTF_COMPILE_LIBRARY", + )] + else: + return WINDOWS_COPTS + [if_oss( + "/DTF_COMPILE_LIBRARY", + "-DTF_COMPILE_LIBRARY", + )] + +def tf_copts( + android_optimization_level_override = "-O2", + is_external = False, + allow_exceptions = False): + # For compatibility reasons, android_optimization_level_override + # is currently only being set for Android. + # To clear this value, and allow the CROSSTOOL default + # to be used, pass android_optimization_level_override=None + android_copts = [ + "-DTF_LEAN_BINARY", + "-Wno-narrowing", + ] + if android_optimization_level_override: + android_copts.append(android_optimization_level_override) + return ( + if_not_windows([ + "-DEIGEN_AVOID_STL_ARRAY", + "-Iexternal/gemmlowp", + "-Wno-sign-compare", + "-ftemplate-depth=900", + ]) + + (if_not_windows(["-fno-exceptions"]) if not allow_exceptions else []) + + if_cuda(["-DGOOGLE_CUDA=1"]) + + if_nvcc(["-DTENSORFLOW_USE_NVCC=1"]) + + if_libtpu(["-DLIBTPU_ON_GCE"], []) + + if_xla_available(["-DTENSORFLOW_USE_XLA=1"]) + + if_tensorrt(["-DGOOGLE_TENSORRT=1"]) + + if_rocm(["-DTENSORFLOW_USE_ROCM=1"]) + + # Compile in oneDNN based ops when building for x86 platforms + if_mkl(["-DINTEL_MKL"]) + + # Enable additional ops (e.g., ops with non-NHWC data layout) and + # optimizations for Intel builds using oneDNN if configured + if_enable_mkl(["-DENABLE_MKL"]) + + if_mkldnn_openmp(["-DENABLE_ONEDNN_OPENMP"]) + + onednn_v3_define() + + if_mkldnn_aarch64_acl(["-DDNNL_AARCH64_USE_ACL=1"]) + + if_mkldnn_aarch64_acl_openmp(["-DENABLE_ONEDNN_OPENMP"]) + + if_zendnn(["-DAMD_ZENDNN"]) + + if_enable_acl(["-DXLA_CPU_USE_ACL=1", "-fexceptions"]) + + if_android_arm(["-mfpu=neon", "-fomit-frame-pointer"]) + + if_linux_x86_64(["-msse3"]) + + if_ios_x86_64(["-msse4.1"]) + + if_no_default_logger(["-DNO_DEFAULT_LOGGER"]) + + select({ + clean_dep("@org_tensorflow//tensorflow:framework_shared_object"): [], + "//conditions:default": ["-DTENSORFLOW_MONOLITHIC_BUILD"], + }) + + select({ + clean_dep("@org_tensorflow//tensorflow:android"): android_copts, + clean_dep("@org_tensorflow//tensorflow:emscripten"): [], + clean_dep("@org_tensorflow//tensorflow:macos"): [], + clean_dep("@org_tensorflow//tensorflow:windows"): get_win_copts(is_external), + clean_dep("@org_tensorflow//tensorflow:ios"): [], + clean_dep("@org_tensorflow//tensorflow:no_lgpl_deps"): ["-D__TENSORFLOW_NO_LGPL_DEPS__", "-pthread"], + "//conditions:default": ["-pthread"], + }) + ) + +def tf_copts_exec( + android_optimization_level_override = "-O2", + is_external = False, + allow_exceptions = False): + return tf_copts( + android_optimization_level_override, + is_external, + allow_exceptions, + ) + if_cuda_exec(["-DGOOGLE_CUDA=1"]) + if_tensorrt_exec(["-DGOOGLE_TENSORRT=1"]) + +def tf_openmp_copts(): + # We assume when compiling on Linux gcc/clang will be used and MSVC on Windows + return select({ + # copybara:uncomment_begin + # "//third_party/mkl:build_with_mkl_lnx_openmp": ["-fopenmp"], + # "//third_party/mkl:build_with_mkl_windows_openmp": ["/openmp"], + # copybara:uncomment_end_and_comment_begin + 
"@org_tensorflow//third_party/mkl:build_with_mkl_lnx_openmp": ["-fopenmp"], + "@org_tensorflow//third_party/mkl:build_with_mkl_windows_openmp": ["/openmp:llvm"], + # copybara:comment_end + "//conditions:default": [], + }) + +def tf_openmp_lopts(): + # When compiling on Windows, force MSVC to use libiomp that was compiled + # as part of this build. + return select({ + "//third_party/mkl:build_with_mkl_windows_openmp": [windows_llvm_openmp_linkopts()], + "//conditions:default": [], + }) + +def tf_opts_nortti(): + return [ + "-fno-rtti", + "-DGOOGLE_PROTOBUF_NO_RTTI", + "-DGOOGLE_PROTOBUF_NO_STATIC_INITIALIZER", + ] + +def tf_opts_force_rtti(): + return select({ + clean_dep("@org_tensorflow//tensorflow:force_rtti"): ["-frtti"], + "//conditions:default": [], + }) + +def tf_opts_nortti_if_android(): + return if_android(tf_opts_nortti()) + tf_opts_force_rtti() + +def tf_opts_nortti_if_mobile(): + return if_mobile(tf_opts_nortti()) + tf_opts_force_rtti() + +def tf_defines_nortti(): + return [ + "GOOGLE_PROTOBUF_NO_RTTI", + "GOOGLE_PROTOBUF_NO_STATIC_INITIALIZER", + ] + +def tf_defines_nortti_if_android(): + return if_android(tf_defines_nortti()) + +def tf_features_nomodules_if_android(): + return if_android(["-use_header_modules"]) + +def tf_features_nomodules_if_mobile(): + return if_mobile(["-use_header_modules"]) + +# portable_tensorflow_lib_lite does not export the headers needed to +# use it. Thus anything that depends on it needs to disable layering +# check. +def tf_features_nolayering_check_if_ios(): + return select({ + clean_dep("@org_tensorflow//tensorflow:ios"): ["-layering_check"], + "//conditions:default": [], + }) + +def tf_opts_nortti_if_lite_protos(): + return tf_portable_full_lite_protos( + full = [], + lite = tf_opts_nortti(), + ) + tf_opts_force_rtti() + +def tf_defines_nortti_if_lite_protos(): + return tf_portable_full_lite_protos( + full = [], + lite = tf_defines_nortti(), + ) + +# Given a list of "op_lib_names" (a list of files in the ops directory +# without their .cc extensions), generate a library for that file. +def tf_gen_op_libs( + op_lib_names, + sub_directory = "ops/", + deps = None, + is_external = True, + compatible_with = None, + features = []): + # Make library out of each op so it can also be used to generate wrappers + # for various languages. + if not deps: + deps = [] + for n in op_lib_names: + cc_library( + name = n + "_op_lib", + copts = tf_copts_exec(is_external = is_external), + features = features, + srcs = [sub_directory + n + ".cc"], + deps = deps + [clean_dep("@org_tensorflow//tensorflow/core:framework")], + compatible_with = compatible_with, + visibility = ["//visibility:public"], + alwayslink = 1, + linkstatic = 1, + ) + +def _make_search_paths(prefix, levels_to_root): + return ",".join( + [ + "-rpath,%s/%s" % (prefix, "/".join([".."] * search_level)) + for search_level in range(levels_to_root + 1) + ], + ) + +def _rpath_linkopts(name): + # Search parent directories up to the TensorFlow root directory for shared + # object dependencies, even if this op shared object is deeply nested + # (e.g. tensorflow/contrib/package:python/ops/_op_lib.so). tensorflow/ is then + # the root and tensorflow/libtensorflow_framework.so should exist when + # deployed. Other shared object dependencies (e.g. shared between contrib/ + # ops) are picked up as long as they are in either the same or a parent + # directory in the tensorflow/ tree. 
+ levels_to_root = native.package_name().count("/") + name.count("/") + return select({ + clean_dep("@org_tensorflow//tensorflow:macos"): [ + "-Wl,%s" % (_make_search_paths("@loader_path", levels_to_root),), + "-Wl,-rename_section,__TEXT,text_env,__TEXT,__text", + ], + clean_dep("@org_tensorflow//tensorflow:windows"): [], + "//conditions:default": [ + "-Wl,%s" % (_make_search_paths("$$ORIGIN", levels_to_root),), + ], + }) + +def _rpath_user_link_flags(name): + # Search parent directories up to the TensorFlow root directory for shared + # object dependencies, even if this op shared object is deeply nested + # (e.g. tensorflow/contrib/package:python/ops/_op_lib.so). tensorflow/ is then + # the root and tensorflow/libtensorflow_framework.so should exist when + # deployed. Other shared object dependencies (e.g. shared between contrib/ + # ops) are picked up as long as they are in either the same or a parent + # directory in the tensorflow/ tree. + levels_to_root = native.package_name().count("/") + name.count("/") + return select({ + clean_dep("@org_tensorflow//tensorflow:macos"): [ + "-Wl,%s" % (_make_search_paths("@loader_path", levels_to_root),), + "-Wl,-rename_section,__TEXT,text_env,__TEXT,__text", + ], + clean_dep("@org_tensorflow//tensorflow:windows"): [], + "//conditions:default": [ + "-Wl,%s" % (_make_search_paths("$ORIGIN", levels_to_root),), + ], + }) + +# Bazel-generated shared objects which must be linked into TensorFlow binaries +# to define symbols from //tensorflow/core:framework and //tensorflow/core:lib. +def tf_binary_additional_srcs(fullversion = False): + if fullversion: + suffix = "." + VERSION + else: + suffix = "." + VERSION_MAJOR + + return if_static( + extra_deps = [], + macos = [ + clean_dep("@org_tensorflow//tensorflow:libtensorflow_framework%s.dylib" % suffix), + ], + otherwise = [ + clean_dep("@org_tensorflow//tensorflow:libtensorflow_framework.so%s" % suffix), + ], + ) + +def tf_binary_additional_data_deps(): + return if_static( + extra_deps = [], + macos = [ + clean_dep("@org_tensorflow//tensorflow:libtensorflow_framework.dylib"), + clean_dep("@org_tensorflow//tensorflow:libtensorflow_framework.%s.dylib" % VERSION_MAJOR), + clean_dep("@org_tensorflow//tensorflow:libtensorflow_framework.%s.dylib" % VERSION), + ], + otherwise = [ + clean_dep("@org_tensorflow//tensorflow:libtensorflow_framework.so"), + clean_dep("@org_tensorflow//tensorflow:libtensorflow_framework.so.%s" % VERSION_MAJOR), + clean_dep("@org_tensorflow//tensorflow:libtensorflow_framework.so.%s" % VERSION), + ], + ) + +def tf_binary_pybind_deps(): + return select({ + clean_dep("@org_tensorflow//tensorflow:macos"): [ + clean_dep( + "@org_tensorflow//tensorflow/python:_pywrap_tensorflow_internal_macos", + ), + ], + clean_dep("@org_tensorflow//tensorflow:windows"): [ + clean_dep( + "@org_tensorflow//tensorflow/python:_pywrap_tensorflow_internal_windows", + ), + ], + "//conditions:default": [ + clean_dep( + "@org_tensorflow//tensorflow/python:_pywrap_tensorflow_internal_linux", + ), + ], + }) + +# Helper function for the per-OS tensorflow libraries and their version symlinks +def tf_shared_library_deps(): + return select({ + clean_dep("@org_tensorflow//tensorflow:macos_with_framework_shared_object"): [ + clean_dep("@org_tensorflow//tensorflow:libtensorflow.dylib"), + clean_dep("@org_tensorflow//tensorflow:libtensorflow.%s.dylib" % VERSION_MAJOR), + clean_dep("@org_tensorflow//tensorflow:libtensorflow.%s.dylib" % VERSION), + ], + clean_dep("@org_tensorflow//tensorflow:macos"): [], + 
clean_dep("@org_tensorflow//tensorflow:windows"): [ + clean_dep("@org_tensorflow//tensorflow:tensorflow.dll"), + clean_dep("@org_tensorflow//tensorflow:tensorflow_dll_import_lib"), + ], + clean_dep("@org_tensorflow//tensorflow:framework_shared_object"): [ + clean_dep("@org_tensorflow//tensorflow:libtensorflow.so"), + clean_dep("@org_tensorflow//tensorflow:libtensorflow.so.%s" % VERSION_MAJOR), + clean_dep("@org_tensorflow//tensorflow:libtensorflow.so.%s" % VERSION), + ], + "//conditions:default": [], + }) + tf_binary_additional_srcs() + +# Helper functions to add kernel dependencies to tf binaries when using dynamic +# kernel linking. +def tf_binary_dynamic_kernel_dsos(): + return if_dynamic_kernels( + extra_deps = [ + # TODO(gunan): Remove dependencies on these, and make them load dynamically. + # "@org_tensorflow//tensorflow/core/kernels:libtfkernel_all_kernels.so", + ], + otherwise = [], + ) + +# Helper functions to add kernel dependencies to tf binaries when using static +# kernel linking. +def tf_binary_dynamic_kernel_deps(kernels): + return if_dynamic_kernels( + extra_deps = [], + otherwise = kernels, + ) + +# Shared libraries have different name pattern on different platforms, +# but cc_binary cannot output correct artifact name yet, +# so we generate multiple cc_binary targets with all name patterns when necessary. +# TODO(pcloudy): Remove this workaround when https://github.com/bazelbuild/bazel/issues/4570 +# is done and cc_shared_library is available. +SHARED_LIBRARY_NAME_PATTERN_LINUX = "lib%s.so%s" +SHARED_LIBRARY_NAME_PATTERN_MACOS = "lib%s%s.dylib" +SHARED_LIBRARY_NAME_PATTERN_WINDOWS = "%s%s.dll" +SHARED_LIBRARY_NAME_PATTERNS = [ + SHARED_LIBRARY_NAME_PATTERN_LINUX, + SHARED_LIBRARY_NAME_PATTERN_MACOS, + SHARED_LIBRARY_NAME_PATTERN_WINDOWS, +] + +def tf_cc_shared_object( + name, + srcs = [], + deps = [], + data = [], + linkopts = lrt_if_needed(), + framework_so = tf_binary_additional_srcs(), + soversion = None, + kernels = [], + per_os_targets = False, # Generate targets with SHARED_LIBRARY_NAME_PATTERNS + visibility = None, + **kwargs): + """Configure the shared object (.so) file for TensorFlow.""" + if soversion != None: + suffix = "." + str(soversion).split(".")[0] + longsuffix = "." 
+ str(soversion) + else: + suffix = "" + longsuffix = "" + + if per_os_targets: + names = [ + ( + pattern % (name, ""), + pattern % (name, suffix), + pattern % (name, longsuffix), + ) + for pattern in SHARED_LIBRARY_NAME_PATTERNS + ] + else: + names = [( + name, + name + suffix, + name + longsuffix, + )] + + testonly = kwargs.pop("testonly", False) + + for name_os, name_os_major, name_os_full in names: + # Windows DLLs cant be versioned + if name_os.endswith(".dll"): + name_os_major = name_os + name_os_full = name_os + + if name_os != name_os_major: + native.genrule( + name = name_os + "_sym", + outs = [name_os], + srcs = [name_os_major], + output_to_bindir = 1, + cmd = "ln -sf $$(basename $<) $@", + ) + native.genrule( + name = name_os_major + "_sym", + outs = [name_os_major], + srcs = [name_os_full], + output_to_bindir = 1, + cmd = "ln -sf $$(basename $<) $@", + ) + + soname = name_os_major.split("/")[-1] + + data_extra = [] + if framework_so != []: + data_extra = tf_binary_additional_data_deps() + + cc_binary( + exec_properties = if_google({"cpp_link.mem": "16g"}, {}), + name = name_os_full, + srcs = srcs + framework_so, + deps = deps, + linkshared = 1, + data = data + data_extra, + linkopts = linkopts + _rpath_linkopts(name_os_full) + select({ + clean_dep("@org_tensorflow//tensorflow:ios"): [ + "-Wl,-install_name,@rpath/" + soname, + ], + clean_dep("@org_tensorflow//tensorflow:macos"): [ + "-Wl,-install_name,@rpath/" + soname, + ], + clean_dep("@org_tensorflow//tensorflow:windows"): [], + "//conditions:default": [ + "-Wl,-soname," + soname, + ], + }), + testonly = testonly, + visibility = visibility, + **kwargs + ) + + flat_names = [item for sublist in names for item in sublist] + if name not in flat_names: + native.filegroup( + name = name, + srcs = select({ + clean_dep("@org_tensorflow//tensorflow:windows"): [":%s.dll" % (name)], + clean_dep("@org_tensorflow//tensorflow:macos"): [":lib%s%s.dylib" % (name, longsuffix)], + "//conditions:default": [":lib%s.so%s" % (name, longsuffix)], + }), + visibility = visibility, + testonly = testonly, + ) + +# buildozer: disable=function-docstring-args +def tf_cc_shared_library_opensource( + name, + srcs = [], + dynamic_deps = [], + static_deps = [], + deps = [], + roots = [], + exports_filter = [], + data = [], + copts = [], + linkopts = lrt_if_needed(), + additional_linker_inputs = [], + linkstatic = True, + framework_so = [clean_dep("@org_tensorflow//tensorflow:libtensorflow_framework_import_lib")], + soversion = None, + per_os_targets = False, # TODO(rostam): Should be deprecated. + win_def_file = None, + visibility = None): + """Configures the shared object file for TensorFlow.""" + names = _get_shared_library_name_os_version_matrix( + name, + per_os_targets = per_os_targets, + version = soversion, + ) + for name_os, name_os_major, name_os_full in names: + soname = name_os_major.split("/")[-1] # Uses major version for soname. 
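+        # Illustrative example (not from upstream TensorFlow): with
+        # per_os_targets = True, name = "tensorflow_cc" and soversion = "2.14.0",
+        # the Linux entry yields the soname "libtensorflow_cc.so.2".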
+ user_link_flags = linkopts + _rpath_user_link_flags(name_os_full) + select({ + clean_dep("@org_tensorflow//tensorflow:ios"): [ + "-Wl,-install_name,@rpath/" + soname, + ], + clean_dep("@org_tensorflow//tensorflow:macos"): [ + "-Wl,-install_name,@rpath/" + soname, + ], + clean_dep("@org_tensorflow//tensorflow:windows"): [], + "//conditions:default": [ + "-Wl,-soname," + soname, + ], + }) + _tf_cc_shared_library_opensource( + name_os_full, + additional_linker_inputs = additional_linker_inputs, + copts = copts, + data = data, + deps = deps + framework_so, + dynamic_deps = dynamic_deps, + exports_filter = exports_filter, + linkstatic = linkstatic, + roots = roots, + shared_lib_name = name_os_full, + srcs = srcs, + static_deps = static_deps, + user_link_flags = user_link_flags, + visibility = visibility, + win_def_file = win_def_file, + ) + + if name_os != name_os_major: + filegroup_name = name_os_full + "_filegroup" + filegroup( + name = filegroup_name, + srcs = [name_os_full], + output_group = "main_shared_library_output", + visibility = visibility, + ) + _create_symlink(name_os, name_os_major, visibility = visibility) + _create_symlink(name_os_major, filegroup_name, visibility = visibility) + + if name not in [item for sublist in names for item in sublist]: + native.filegroup( + name = name, + srcs = select({ + clean_dep("@org_tensorflow//tensorflow:windows"): [":%s" % get_versioned_shared_library_name_windows(name, soversion)], + clean_dep("@org_tensorflow//tensorflow:macos"): [":%s" % get_versioned_shared_library_name_macos(name, soversion)], + "//conditions:default": [":%s" % get_versioned_shared_library_name_linux(name, soversion)], + }), + visibility = visibility, + ) + +def _tf_cc_shared_library_opensource( + name, + additional_linker_inputs = None, + copts = None, + data = None, + deps = None, + dynamic_deps = None, + exports_filter = None, + linkstatic = False, + roots = None, + shared_lib_name = None, + srcs = None, + static_deps = None, + user_link_flags = None, + visibility = None, + win_def_file = None): + cc_library_name = name + "_cclib" + cc_library( + name = cc_library_name, + srcs = srcs, + data = data, + deps = deps, + copts = copts, + linkstatic = linkstatic, + ) + cc_shared_library( + name = name, + roots = [cc_library_name] + roots, + exports_filter = exports_filter, + dynamic_deps = dynamic_deps, + static_deps = static_deps, + shared_lib_name = shared_lib_name, + user_link_flags = user_link_flags, + additional_linker_inputs = additional_linker_inputs, + visibility = visibility, + win_def_file = if_windows(win_def_file, otherwise = None), + ) + +def _create_symlink(src, dest, visibility = None): + native.genrule( + name = src + "_sym", + outs = [src], + srcs = [dest], + output_to_bindir = 1, + cmd = "ln -sf $$(realpath --relative-to=$(RULEDIR) $<) $@", + visibility = visibility, + ) + +def _get_shared_library_name_os_version_matrix(name, per_os_targets = False, version = None): + if per_os_targets: + names = [ + (get_versioned_shared_library_name_linux(name), get_versioned_shared_library_name_linux(name, version, True), get_versioned_shared_library_name_linux(name, version)), + (get_versioned_shared_library_name_macos(name), get_versioned_shared_library_name_macos(name, version, True), get_versioned_shared_library_name_macos(name, version)), + (get_versioned_shared_library_name_windows(name), get_versioned_shared_library_name_windows(name, version, True), get_versioned_shared_library_name_windows(name, version)), + ] + else: + names = [(name, name + 
get_suffix_major_version(version), name + get_suffix_version(version))] + return names + +def get_versioned_shared_library_name_linux(name, version = None, major = False): + if major: + name = SHARED_LIBRARY_NAME_PATTERN_LINUX % (name, get_suffix_major_version(version)) + else: + name = SHARED_LIBRARY_NAME_PATTERN_LINUX % (name, get_suffix_version(version)) + return name + +def get_versioned_shared_library_name_macos(name, version = None, major = False): + if major: + name = SHARED_LIBRARY_NAME_PATTERN_MACOS % (name, get_suffix_major_version(version)) + else: + name = SHARED_LIBRARY_NAME_PATTERN_MACOS % (name, get_suffix_version(version)) + return name + +def get_versioned_shared_library_name_windows(name, version = None, major = False): + _ = version # buildifier: disable=unused-variable + _ = major # buildifier: disable=unused-variable + return SHARED_LIBRARY_NAME_PATTERN_WINDOWS % (name, "") + +def get_suffix_version(version): + return "." + str(version) if version else "" + +def get_suffix_major_version(version): + return "." + extract_major_version(version) if version else "" + +def extract_major_version(version): + return str(version).split(".", 1)[0] + +# Export open source version of tf_cc_shared_library under base name as well. +tf_cc_shared_library = tf_cc_shared_library_opensource + +# Links in the framework shared object +# (//third_party/tensorflow:libtensorflow_framework.so) when not building +# statically. Also adds linker options (rpaths) so that the framework shared +# object can be found. +def tf_cc_binary( + name, + srcs = [], + deps = [], + data = [], + linkopts = lrt_if_needed(), + copts = tf_copts(), + kernels = [], + per_os_targets = False, # Generate targets with SHARED_LIBRARY_NAME_PATTERNS + visibility = None, + default_copts = [], + **kwargs): + if kernels: + added_data_deps = tf_binary_dynamic_kernel_dsos() + else: + added_data_deps = [] + + if per_os_targets: + names = [pattern % (name, "") for pattern in SHARED_LIBRARY_NAME_PATTERNS] + else: + names = [name] + + # Optional MKL dependency, we also tell buildcleaner to ignore this dep using a tag. + mkl_dep = if_mkl_ml([clean_dep("//third_party/mkl:intel_binary_blob")]) + tags = kwargs.pop("tags", []) + ["req_dep=" + clean_dep("//third_party/mkl:intel_binary_blob")] + + for name_os in names: + cc_binary( + name = name_os, + copts = default_copts + copts, + srcs = srcs + tf_binary_additional_srcs(), + deps = deps + tf_binary_dynamic_kernel_deps(kernels) + mkl_dep + if_static( + extra_deps = [], + otherwise = [ + clean_dep("@org_tensorflow//tensorflow:libtensorflow_framework_import_lib"), + ], + ), + tags = tags, + data = depset(data + added_data_deps), + linkopts = linkopts + _rpath_linkopts(name_os), + visibility = visibility, + **kwargs + ) + if name not in names: + native.filegroup( + name = name, + srcs = select({ + "@org_tensorflow//tensorflow:windows": [":%s.dll" % name], + "@org_tensorflow//tensorflow:macos": [":lib%s.dylib" % name], + "//conditions:default": [":lib%s.so" % name], + }), + visibility = visibility, + ) + +register_extension_info( + extension = tf_cc_binary, + label_regex_for_dep = "{extension_name}", +) + +# A simple wrap around native.cc_binary rule. +# When using this rule, you should realize it doesn't link to any tensorflow +# dependencies by default. 
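+# Illustrative usage sketch (target and file names below are hypothetical,
+# not part of upstream TensorFlow):
+#   tf_native_cc_binary(
+#       name = "my_tool",
+#       srcs = ["my_tool.cc"],
+#   )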
+def tf_native_cc_binary( + name, + copts = tf_copts(), + linkopts = [], + **kwargs): + cc_binary( + name = name, + copts = copts, + linkopts = select({ + clean_dep("@org_tensorflow//tensorflow:windows"): [], + clean_dep("@org_tensorflow//tensorflow:macos"): [ + "-lm", + ], + "//conditions:default": [ + "-lpthread", + "-lm", + ], + }) + linkopts + _rpath_linkopts(name) + lrt_if_needed(), + **kwargs + ) + +def tf_gen_op_wrapper_cc( + name, + out_ops_file, + pkg = "", + op_gen = clean_dep("@org_tensorflow//tensorflow/cc:cc_op_gen_main"), + deps = None, + include_internal_ops = 0, + # ApiDefs will be loaded in the order specified in this list. + api_def_srcs = [], + compatible_with = []): + # Construct an op generator binary for these ops. + tool = out_ops_file + "_gen_cc" + if deps == None: + deps = [pkg + ":" + name + "_op_lib"] + tf_cc_binary( + name = tool, + copts = tf_copts(), + linkopts = if_not_windows(["-lm", "-Wl,-ldl"]) + lrt_if_needed(), + linkstatic = 1, # Faster to link this one-time-use binary dynamically + deps = [op_gen] + deps, + ) + + srcs = api_def_srcs[:] + + if not api_def_srcs: + api_def_args_str = "," + else: + api_def_args = [] + for api_def_src in api_def_srcs: + # Add directory of the first ApiDef source to args. + # We are assuming all ApiDefs in a single api_def_src are in the + # same directory. + api_def_args.append( + " $$(dirname $$(echo $(locations " + api_def_src + + ") | cut -d\" \" -f1))", + ) + api_def_args_str = ",".join(api_def_args) + + native.genrule( + name = name + "_genrule", + outs = [ + out_ops_file + ".h", + out_ops_file + ".cc", + out_ops_file + "_internal.h", + out_ops_file + "_internal.cc", + ], + srcs = srcs, + tools = [":" + tool] + tf_binary_additional_srcs(), + cmd = ("$(location :" + tool + ") $(location :" + out_ops_file + ".h) " + + "$(location :" + out_ops_file + ".cc) " + + str(include_internal_ops) + " " + api_def_args_str), + compatible_with = compatible_with, + ) + +# Given a list of "op_lib_names" (a list of files in the ops directory +# without their .cc extensions), generate individual C++ .cc and .h +# files for each of the ops files mentioned, and then generate a +# single cc_library called "name" that combines all the +# generated C++ code. +# +# For example, for: +# tf_gen_op_wrappers_cc("tf_ops_lib", [ "array_ops", "math_ops" ]) +# +# +# This will ultimately generate ops/* files and a library like: +# +# cc_library(name = "tf_ops_lib", +# srcs = [ "ops/array_ops.cc", +# "ops/math_ops.cc" ], +# hdrs = [ "ops/array_ops.h", +# "ops/math_ops.h" ], +# deps = [ ... ]) +# +# Plus a private library for the "hidden" ops. +# cc_library(name = "tf_ops_lib_internal", +# srcs = [ "ops/array_ops_internal.cc", +# "ops/math_ops_internal.cc" ], +# hdrs = [ "ops/array_ops_internal.h", +# "ops/math_ops_internal.h" ], +# deps = [ ... ]) +# TODO(joshl): Cleaner approach for hidden ops. +def tf_gen_op_wrappers_cc( + name, + op_lib_names = [], + other_srcs = [], + other_hdrs = [], + other_srcs_internal = [], + other_hdrs_internal = [], + pkg = "", + deps = [ + clean_dep("@org_tensorflow//tensorflow/cc:ops"), + clean_dep("@org_tensorflow//tensorflow/cc:scope"), + clean_dep("@org_tensorflow//tensorflow/cc:const_op"), + ], + deps_internal = [], + op_gen = clean_dep("@org_tensorflow//tensorflow/cc:cc_op_gen_main"), + include_internal_ops = 0, + visibility = None, + # ApiDefs will be loaded in the order specified in this list. + api_def_srcs = [], + # Any extra dependencies that the wrapper generator might need. 
+ extra_gen_deps = [], + compatible_with = []): + subsrcs = other_srcs[:] + subhdrs = other_hdrs[:] + internalsrcs = other_srcs_internal[:] + internalhdrs = other_hdrs_internal[:] + for n in op_lib_names: + tf_gen_op_wrapper_cc( + n, + "ops/" + n, + api_def_srcs = api_def_srcs, + include_internal_ops = include_internal_ops, + op_gen = op_gen, + pkg = pkg, + deps = [pkg + ":" + n + "_op_lib"] + extra_gen_deps, + compatible_with = compatible_with, + ) + subsrcs += ["ops/" + n + ".cc"] + subhdrs += ["ops/" + n + ".h"] + internalsrcs += ["ops/" + n + "_internal.cc"] + internalhdrs += ["ops/" + n + "_internal.h"] + + cc_library( + name = name, + srcs = subsrcs, + hdrs = subhdrs, + deps = deps + if_not_android([ + clean_dep("@org_tensorflow//tensorflow/core:core_cpu"), + clean_dep("@org_tensorflow//tensorflow/core:framework"), + clean_dep("@org_tensorflow//tensorflow/core:lib"), + clean_dep("@org_tensorflow//tensorflow/core:ops"), + clean_dep("@org_tensorflow//tensorflow/core:protos_all_cc"), + ]) + if_android([ + clean_dep("@org_tensorflow//tensorflow/core:portable_tensorflow_lib"), + ]), + copts = tf_copts(), + alwayslink = 1, + visibility = visibility, + compatible_with = compatible_with, + ) + cc_library( + name = name + "_internal", + srcs = internalsrcs, + hdrs = internalhdrs, + deps = deps + deps_internal + if_not_android([ + clean_dep("@org_tensorflow//tensorflow/core:core_cpu"), + clean_dep("@org_tensorflow//tensorflow/core:framework"), + clean_dep("@org_tensorflow//tensorflow/core:lib"), + clean_dep("@org_tensorflow//tensorflow/core:ops"), + clean_dep("@org_tensorflow//tensorflow/core:protos_all_cc"), + ]) + if_android([ + clean_dep("@org_tensorflow//tensorflow/core:portable_tensorflow_lib"), + ]), + copts = tf_copts(), + alwayslink = 1, + visibility = [clean_dep("@org_tensorflow//tensorflow:internal")], + compatible_with = compatible_with, + ) + +OpRegistrationSrcInfo = provider( + "Info needed to extract op registration sources.", + fields = { + "srcs": "depset of source Files that contains op registrations.", + }, +) + +def _collect_op_reg_srcs_aspect_impl(_target, ctx): + """Aspect implementation function for collect_op_reg_srcs_aspect. + + This aspect will traverse the dependency graph along the "deps" attribute of the target + and return an OpRegistrationSrcInfo provider. + + OpRegistrationSrcInfo will have the union of the srcs of the C++ dependencies + with filename end with "_ops.cc" or "_op.cc". 
+ """ + direct, transitive = [], [] + if ctx.rule.kind == "cc_library" and hasattr(ctx.rule.attr, "srcs"): + # Assuming the filename of op registration source files ends with "_ops.cc" or "_op.cc" + direct += [ + src + for src in ctx.rule.files.srcs + if src.path.endswith("_op.cc") or src.path.endswith("_ops.cc") + ] + if hasattr(ctx.rule.attr, "deps"): + for dep in ctx.rule.attr.deps: + if OpRegistrationSrcInfo in dep: + transitive.append(dep[OpRegistrationSrcInfo].srcs) + if not direct and not transitive: + return [] + return [OpRegistrationSrcInfo(srcs = depset(direct = direct, transitive = transitive))] + +collect_op_reg_srcs_aspect = aspect( + attr_aspects = ["deps"], + required_providers = [CcInfo], + implementation = _collect_op_reg_srcs_aspect_impl, +) + +def _generate_op_reg_offsets_impl(ctx): + op_reg_srcs = [] + for dep in ctx.attr.deps: + if OpRegistrationSrcInfo in dep: + for src in dep[OpRegistrationSrcInfo].srcs.to_list(): + op_reg_srcs.append(src) + + args = ctx.actions.args() + args.add(ctx.outputs.out.path, format = "--out_path=%s") + args.add_all(op_reg_srcs) + + ctx.actions.run( + outputs = [ctx.outputs.out], + inputs = op_reg_srcs + ctx.files.tf_binary_additional_srcs, + tools = [ctx.executable._offset_counter], + executable = ctx.executable._offset_counter, + arguments = [args], + ) + +generate_op_reg_offsets = rule( + attrs = { + "out": attr.output(), + "deps": attr.label_list( + aspects = [collect_op_reg_srcs_aspect], + mandatory = True, + allow_files = True, + providers = [CcInfo], + ), + # This is for carrying the required files for _offset_counter to execute. + "tf_binary_additional_srcs": attr.label_list( + cfg = "exec", + mandatory = True, + allow_files = True, + ), + "_offset_counter": attr.label( + cfg = "exec", + executable = True, + allow_files = True, + default = "@org_tensorflow//tensorflow/python/framework:offset_counter", + ), + }, + implementation = _generate_op_reg_offsets_impl, +) + +def tf_gen_op_wrapper_py( + name, + out = None, + hidden = None, + visibility = None, + deps = [], + require_shape_functions = False, + hidden_file = None, + generated_target_name = None, + op_whitelist = [], + op_allowlist = [], + cc_linkopts = lrt_if_needed(), + api_def_srcs = [], + compatible_with = [], + testonly = False, + copts = [], + extra_py_deps = None, + py_lib_rule = _plain_py_library): + """Generates a Python library target wrapping the ops registered in "deps". + + Args: + name: used as the name of the generated target and as a name component of + the intermediate files. + out: name of the python file created by this rule. If None, then + "ops/gen_{name}.py" is used. + hidden: Optional list of ops names to make private in the Python module. + It is invalid to specify both "hidden" and "op_allowlist". + visibility: passed to py_library. + deps: list of dependencies for the intermediate tool used to generate the + python target. NOTE these `deps` are not applied to the final python + library target itself. + require_shape_functions: Unused. Leave this as False. + hidden_file: optional file that contains a list of op names to make private + in the generated Python module. Each op name should be on a line by + itself. Lines that start with characters that are invalid op name + starting characters are treated as comments and ignored. + generated_target_name: name of the generated target (overrides the + "name" arg) + op_whitelist: [DEPRECATED] an older spelling for "op_allowlist" + op_allowlist: if not empty, only op names in this list will be wrapped. 
It + is invalid to specify both "hidden" and "op_allowlist". + cc_linkopts: Optional linkopts to be added to tf_cc_binary that contains the + specified ops. + api_def_srcs: undocumented. + compatible_with: undocumented. + testonly: undocumented. + copts: undocumented. + extra_py_deps: undocumented. + py_lib_rule: undocumented. + """ + _ = require_shape_functions # Unused. + if op_whitelist and op_allowlist: + fail("op_whitelist is deprecated. Only use op_allowlist.") + if op_whitelist: + full_target_name = "//" + native.package_name() + ":" + name + print("op_whitelist is deprecated. Please migrate to the preferred " + + "`op_allowlist` spelling. Offending target: " + + full_target_name) # buildifier: disable=print + op_allowlist = op_whitelist + + if (hidden or hidden_file) and op_allowlist: + fail("Cannot pass specify both hidden and op_allowlist.") + + # Construct a cc_binary containing the specified ops. + tool_name = "gen_" + name + "_py_wrappers_cc" + if not deps: + deps = [str(Label("@org_tensorflow//tensorflow/core:" + name + "_op_lib"))] + tf_cc_binary( + name = tool_name, + copts = copts + tf_copts_exec(), + linkopts = if_not_windows(["-lm", "-Wl,-ldl"]) + cc_linkopts, + linkstatic = 1, # Faster to link this one-time-use binary dynamically + visibility = [clean_dep("@org_tensorflow//tensorflow:internal")], + deps = ([ + clean_dep("@org_tensorflow//tensorflow/core:framework"), + clean_dep("@org_tensorflow//tensorflow/python/framework:python_op_gen_main"), + ] + deps), + testonly = testonly, + ) + + pygen_args = [] + + # Invoke the previous cc_binary to generate a python file. + if not out: + out = "ops/gen_" + name + ".py" + + extra_srcs = [] + if hidden: + pygen_args.append("--hidden_op_list=" + ",".join(hidden)) + elif hidden_file: + # `hidden_file` is file containing a list of op names to be hidden in the + # generated module. + pygen_args.append("--hidden_op_list_filename=$(location " + hidden_file + ")") + extra_srcs.append(hidden_file) + elif op_allowlist: + pygen_args.append("--op_allowlist=" + ",".join(["'%s'" % op for op in op_allowlist])) + + # Prepare ApiDef directories to pass to the genrule. + if api_def_srcs: + api_def_args = [] + for api_def_src in api_def_srcs: + # Add directory of the first ApiDef source to args. + # We are assuming all ApiDefs in a single api_def_src are in the + # same directory. + api_def_args.append( + "$$(dirname $$(echo $(locations " + api_def_src + + ") | cut -d\" \" -f1))", + ) + pygen_args.append("--api_def_dirs=" + ",".join(api_def_args)) + + op_reg_offset_out = "gen_" + name + "_reg_offsets.pb" + generate_op_reg_offsets( + name = name + "_reg_offsets", + out = op_reg_offset_out, + # Feed an empty dep list if not indexing to skip unnecessary aspect propagation. + deps = select({ + clean_dep("@org_tensorflow//tensorflow:api_indexable"): deps, + "//conditions:default": [], + }), + tf_binary_additional_srcs = tf_binary_additional_srcs(), + testonly = testonly, + ) + extra_srcs.append(op_reg_offset_out) + pygen_args.append("--op_reg_offset_filename=$(location " + op_reg_offset_out + ")") + + native.genrule( + name = name + "_pygenrule", + outs = [out], + srcs = api_def_srcs + extra_srcs, + tools = [tool_name] + tf_binary_additional_srcs(), + cmd = ("$(location " + tool_name + ") " + " ".join(pygen_args) + " > $@"), + compatible_with = compatible_with, + testonly = testonly, + ) + + # Make a py_library out of the generated python file. 
+ if not generated_target_name: + generated_target_name = name + py_deps = [clean_dep("@org_tensorflow//tensorflow/python/framework:for_generated_wrappers_v2")] + if extra_py_deps: + py_deps += extra_py_deps + py_lib_rule( + name = generated_target_name, + srcs = [out], + srcs_version = "PY3", + visibility = visibility, + deps = py_deps, + # Instruct build_cleaner to try to avoid using this rule; typically ops + # creators will provide their own tf_custom_op_py_library based target + # that wraps this one. + tags = ["avoid_dep"], + compatible_with = compatible_with, + testonly = testonly, + ) + +# Define a bazel macro that creates cc_test for tensorflow. +# +# Links in the framework shared object +# (//third_party/tensorflow:libtensorflow_framework.so) when not building +# statically. Also adds linker options (rpaths) so that the framework shared +# object can be found. +# +# TODO(opensource): we need to enable this to work around the hidden symbol +# __cudaRegisterFatBinary error. Need more investigations. +def tf_cc_test( + name, + srcs, + deps, + data = [], + extra_copts = [], + suffix = "", + linkopts = lrt_if_needed(), + kernels = [], + **kwargs): + cc_test( + name = "%s%s" % (name, suffix), + srcs = srcs + tf_binary_additional_srcs(), + copts = tf_copts() + extra_copts, + linkopts = select({ + clean_dep("@org_tensorflow//tensorflow:android"): [ + "-pie", + ], + clean_dep("@org_tensorflow//tensorflow:windows"): [], + clean_dep("@org_tensorflow//tensorflow:macos"): [ + "-lm", + ], + "//conditions:default": [ + "-lpthread", + "-lm", + ], + clean_dep("//third_party/compute_library:build_with_acl"): [ + "-fopenmp", + "-lm", + ], + }) + linkopts + _rpath_linkopts(name), + deps = deps + tf_binary_dynamic_kernel_deps(kernels) + if_mkl_ml( + [ + clean_dep("//third_party/mkl:intel_binary_blob"), + ], + ), + data = data + + tf_binary_dynamic_kernel_dsos() + + tf_binary_additional_srcs(), + exec_properties = tf_exec_properties(kwargs), + **kwargs + ) + +def tf_cc_shared_test( + name, + srcs, + deps, + data = [], + extra_copts = [], + suffix = "", + linkopts = lrt_if_needed(), + kernels = [], + **kwargs): + cc_test( + name = "%s%s" % (name, suffix), + srcs = srcs, + copts = tf_copts() + extra_copts, + linkopts = select({ + clean_dep("@org_tensorflow//tensorflow:android"): [ + "-pie", + ], + clean_dep("@org_tensorflow//tensorflow:windows"): [], + clean_dep("@org_tensorflow//tensorflow:macos"): [ + "-lm", + ], + "//conditions:default": [ + "-lpthread", + "-lm", + ], + clean_dep("//third_party/compute_library:build_with_acl"): [ + "-fopenmp", + "-lm", + ], + }) + linkopts + _rpath_linkopts(name), + deps = deps + tf_binary_dynamic_kernel_deps(kernels) + if_mkl_ml( + [ + clean_dep("//third_party/mkl:intel_binary_blob"), + ], + ), + dynamic_deps = if_static( + extra_deps = [], + macos = ["@org_tensorflow//tensorflow:libtensorflow_framework.%s.dylib" % VERSION], + otherwise = ["@org_tensorflow//tensorflow:libtensorflow_framework.so.%s" % VERSION], + ), + data = data + tf_binary_dynamic_kernel_dsos(), + exec_properties = tf_exec_properties(kwargs), + **kwargs + ) + +register_extension_info( + extension = tf_cc_test, + label_regex_for_dep = "{extension_name}", +) + +# TODO(jakeharmon): Replace with an implementation which doesn't add a +# dependency on core:common_runtime +def tf_gpu_cc_test( + name, + srcs = [], + deps = [], + tags = [], + data = [], + size = "medium", + extra_copts = [], + linkstatic = 0, + args = [], + kernels = [], + linkopts = [], + **kwargs): + targets = [] + tf_cc_test( + name = 
name, + size = size, + srcs = srcs, + args = args, + data = data, + extra_copts = extra_copts + if_cuda(["-DNV_CUDNN_DISABLE_EXCEPTION"]), + kernels = kernels, + linkopts = linkopts, + linkstatic = linkstatic, + suffix = "_cpu", + tags = tags, + deps = deps, + **kwargs + ) + targets.append(name + "_cpu") + tf_cc_test( + name = name, + size = size, + srcs = srcs, + args = args, + data = data, + extra_copts = extra_copts + if_cuda(["-DNV_CUDNN_DISABLE_EXCEPTION"]), + kernels = kernels, + linkopts = linkopts, + linkstatic = select({ + # TODO(allenl): Remove Mac static linking when Bazel 0.6 is out. + clean_dep("@org_tensorflow//tensorflow:macos"): 1, + "@local_config_cuda//cuda:using_nvcc": 1, + "@local_config_cuda//cuda:using_clang": 1, + "//conditions:default": 0, + }), + suffix = "_gpu", + tags = tags + tf_gpu_tests_tags(), + deps = deps + if_cuda_or_rocm([ + clean_dep("@org_tensorflow//tensorflow/core:gpu_runtime"), + ]), + **kwargs + ) + targets.append(name + "_gpu") + if "multi_gpu" in tags or "multi_and_single_gpu" in tags: + cleaned_tags = tags + two_gpu_tags + if "requires-gpu-nvidia" in cleaned_tags: + cleaned_tags.remove("requires-gpu-nvidia") + tf_cc_test( + name = name, + size = size, + srcs = srcs, + args = args, + data = data, + extra_copts = extra_copts, + kernels = kernels, + linkopts = linkopts, + linkstatic = select({ + # TODO(allenl): Remove Mac static linking when Bazel 0.6 is out. + clean_dep("@org_tensorflow//tensorflow:macos"): 1, + "@local_config_cuda//cuda:using_nvcc": 1, + "@local_config_cuda//cuda:using_clang": 1, + "//conditions:default": 0, + }), + suffix = "_2gpu", + tags = cleaned_tags, + deps = deps + if_cuda_or_rocm([ + clean_dep("@org_tensorflow//tensorflow/core:gpu_runtime"), + ]), + **kwargs + ) + targets.append(name + "_2gpu") + + native.test_suite(name = name, tests = targets, tags = tags) + +# terminology changes: saving tf_cuda_* definition for compatibility +def tf_cuda_cc_test(*args, **kwargs): + tf_gpu_cc_test(*args, **kwargs) + +def tf_gpu_only_cc_test( + name, + srcs = [], + deps = [], + tags = [], + data = [], + size = "medium", + args = [], + kernels = [], + linkopts = []): + tags = tags + tf_gpu_tests_tags() + + gpu_lib_name = "%s%s" % (name, "_gpu_lib") + tf_gpu_kernel_library( + name = gpu_lib_name, + srcs = srcs + tf_binary_additional_srcs(), + deps = deps, + testonly = 1, + ) + cc_test( + name = "%s%s" % (name, "_gpu"), + size = size, + args = args, + features = if_cuda(["-use_header_modules"]), + data = data + tf_binary_dynamic_kernel_dsos(), + deps = [":" + gpu_lib_name], + linkopts = if_not_windows(["-lpthread", "-lm"]) + linkopts + _rpath_linkopts(name), + tags = tags, + exec_properties = tf_exec_properties({"tags": tags}), + ) + +# terminology changes: saving tf_cuda_* definition for compatibility +def tf_cuda_only_cc_test(*args, **kwargs): + tf_gpu_only_cc_test(*args, **kwargs) + +# Create a cc_test for each of the tensorflow tests listed in "tests", along +# with a test suite of the given name, if provided. 
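+# Illustrative usage sketch (file and target names below are hypothetical,
+# not part of upstream TensorFlow):
+#   tf_cc_tests(
+#       name = "small_tests",
+#       srcs = ["foo_test.cc", "bar_test.cc"],
+#       deps = [":test_support_lib"],
+#       create_named_test_suite = True,
+#   )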
+def tf_cc_tests( + srcs, + deps, + name = "", + linkstatic = 0, + tags = [], + size = "medium", + args = None, + linkopts = lrt_if_needed(), + kernels = [], + create_named_test_suite = False, + visibility = None): + test_names = [] + for src in srcs: + test_name = src_to_test_name(src) + tf_cc_test( + name = test_name, + size = size, + srcs = [src], + args = args, + kernels = kernels, + linkopts = linkopts, + linkstatic = linkstatic, + tags = tags, + deps = deps, + visibility = visibility, + ) + test_names.append(test_name) + + # Add a test suite with the generated tests if a name was provided and + # it does not conflict any of the test names. + if create_named_test_suite: + native.test_suite( + name = name, + tests = test_names, + visibility = visibility, + tags = tags, + ) + +register_extension_info( + extension = tf_cc_tests, + label_regex_for_dep = "{extension_name}", +) + +def tf_cc_test_mkl( + srcs, + deps, + name = "", + data = [], + linkstatic = 0, + tags = [], + size = "medium", + kernels = [], + args = None): + # -fno-exceptions in nocopts breaks compilation if header modules are enabled. + disable_header_modules = ["-use_header_modules"] + + for src in srcs: + cc_test( + name = src_to_test_name(src), + srcs = if_mkl([src]) + tf_binary_additional_srcs(), + # Adding an explicit `-fexceptions` because `allow_exceptions = True` + # in `tf_copts` doesn't work internally. + copts = tf_copts() + ["-fexceptions"] + tf_openmp_copts(), + linkopts = select({ + clean_dep("@org_tensorflow//tensorflow:android"): [ + "-pie", + ], + clean_dep("@org_tensorflow//tensorflow:windows"): [], + "//conditions:default": [ + "-lpthread", + "-lm", + ], + }) + _rpath_linkopts(src_to_test_name(src)) + tf_openmp_lopts(), + deps = deps + tf_binary_dynamic_kernel_deps(kernels) + if_mkl_ml(["//third_party/mkl:intel_binary_blob"]), + data = data + tf_binary_dynamic_kernel_dsos(), + exec_properties = tf_exec_properties({"tags": tags}), + linkstatic = linkstatic, + tags = tags, + size = size, + args = args, + features = disable_header_modules, + ) + +def tf_gpu_cc_tests( + srcs, + deps, + name = "", + tags = [], + size = "medium", + linkstatic = 0, + args = None, + kernels = [], + linkopts = []): + for src in srcs: + tf_gpu_cc_test( + name = src_to_test_name(src), + size = size, + srcs = [src], + args = args, + kernels = kernels, + linkopts = linkopts, + linkstatic = linkstatic, + tags = tags, + deps = deps, + ) + +# terminology changes: saving tf_cuda_* definition for compatibility +def tf_cuda_cc_tests(*args, **kwargs): + tf_gpu_cc_tests(*args, **kwargs) + +def tf_java_test( + name, + srcs = [], + deps = [], + kernels = [], + *args, + **kwargs): + cc_library_name = name + "_cclib" + cc_library( + # TODO(b/183579145): Remove when cc_shared_library supports CcInfo or JavaInfo providers . + name = cc_library_name, + srcs = tf_binary_additional_srcs(fullversion = True) + tf_binary_dynamic_kernel_dsos() + tf_binary_dynamic_kernel_deps(kernels), + ) + native.java_test( + name = name, + srcs = srcs, + deps = deps + [cc_library_name], + *args, + **kwargs + ) + +def _cuda_copts(opts = []): + """Gets the appropriate set of copts for (maybe) CUDA compilation. + + If we're doing CUDA compilation, returns copts for our particular CUDA + compiler. If we're not doing CUDA compilation, returns an empty list. 
+ + """ + return select({ + "//conditions:default": [], + "@local_config_cuda//cuda:using_nvcc": [ + "-nvcc_options=relaxed-constexpr", + "-nvcc_options=ftz=true", + ] + opts, + "@local_config_cuda//cuda:using_clang": [ + "-fcuda-flush-denormals-to-zero", + ] + opts, + }) + +# Build defs for TensorFlow kernels + +# When this target is built using --config=cuda, a cc_library is built +# that passes -DGOOGLE_CUDA=1 and '-x cuda', linking in additional +# libraries needed by GPU kernels. +# +# When this target is built using --config=rocm, a cc_library is built +# that passes -DTENSORFLOW_USE_ROCM and '-x rocm', linking in additional +# libraries needed by GPU kernels. +def tf_gpu_kernel_library( + srcs, + copts = [], + cuda_copts = [], + deps = [], + hdrs = [], + **kwargs): + copts = copts + tf_copts() + _cuda_copts(opts = cuda_copts) + rocm_copts(opts = cuda_copts) + kwargs["features"] = kwargs.get("features", []) + ["-use_header_modules"] + + cuda_library( + srcs = srcs, + hdrs = hdrs, + copts = copts, + deps = deps + if_cuda([ + clean_dep("@org_tensorflow//tensorflow/tsl/cuda:cudart_stub"), + ]) + if_cuda_or_rocm([ + clean_dep("@org_tensorflow//tensorflow/core:gpu_lib"), + ]), + alwayslink = 1, + **kwargs + ) + +tf_gpu_library = tsl_gpu_library + +# terminology changes: saving tf_cuda_* definition for compatibility +tf_cuda_library = tsl_gpu_library + +def tf_kernel_library( + name, + prefix = None, + srcs = None, + gpu_srcs = None, + hdrs = None, + deps = None, + gpu_deps = None, + alwayslink = 1, + copts = None, + gpu_copts = None, + is_external = False, + compatible_with = None, + **kwargs): + """A rule to build a TensorFlow OpKernel. + + May either specify srcs/hdrs or prefix. Similar to tf_gpu_library, + but with alwayslink=1 by default. If prefix is specified: + * prefix*.cc (except *.cu.cc) is added to srcs + * prefix*.h (except *.cu.h) is added to hdrs + * prefix*.cu.cc and prefix*.h (including *.cu.h) are added to gpu_srcs. + With the exception that test files are excluded. + For example, with prefix = "cast_op", + * srcs = ["cast_op.cc"] + * hdrs = ["cast_op.h"] + * gpu_srcs = ["cast_op_gpu.cu.cc", "cast_op.h"] + * "cast_op_test.cc" is excluded + With prefix = "cwise_op" + * srcs = ["cwise_op_abs.cc", ..., "cwise_op_tanh.cc"], + * hdrs = ["cwise_ops.h", "cwise_ops_common.h"], + * gpu_srcs = ["cwise_op_gpu_abs.cu.cc", ..., "cwise_op_gpu_tanh.cu.cc", + "cwise_ops.h", "cwise_ops_common.h", + "cwise_ops_gpu_common.cu.h"] + * "cwise_ops_test.cc" is excluded + """ + if not srcs: + srcs = [] + if not hdrs: + hdrs = [] + if not deps: + deps = [] + if not gpu_deps: + gpu_deps = [] + if not copts: + copts = [] + if not gpu_copts: + gpu_copts = [] + textual_hdrs = [] + copts = copts + tf_copts(is_external = is_external) + if_cuda(["-DNV_CUDNN_DISABLE_EXCEPTION"]) + + # Override EIGEN_STRONG_INLINE to inline when + # --define=override_eigen_strong_inline=true to avoid long compiling time. 
+ # See https://github.com/tensorflow/tensorflow/issues/10521 + copts = copts + if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]) + if prefix: + if native.glob([prefix + "*.cu.cc"], exclude = ["*test*"]): + if not gpu_srcs: + gpu_srcs = [] + gpu_srcs = gpu_srcs + native.glob( + [prefix + "*.cu.cc", prefix + "*.h"], + exclude = [prefix + "*test*"], + ) + srcs = srcs + native.glob( + [prefix + "*.cc"], + exclude = [prefix + "*test*", prefix + "*.cu.cc"], + ) + hdrs = hdrs + native.glob( + [prefix + "*.h"], + exclude = [prefix + "*test*", prefix + "*.cu.h", prefix + "*impl.h"], + ) + textual_hdrs = native.glob( + [prefix + "*impl.h"], + exclude = [prefix + "*test*", prefix + "*.cu.h"], + ) + cuda_deps = [clean_dep("@org_tensorflow//tensorflow/core:gpu_lib")] + if gpu_srcs: + for gpu_src in gpu_srcs: + if gpu_src.endswith(".cc") and not gpu_src.endswith(".cu.cc"): + fail("{} not allowed in gpu_srcs. .cc sources must end with .cu.cc" + .format(gpu_src)) + tf_gpu_kernel_library( + name = name + "_gpu", + srcs = gpu_srcs, + deps = deps + gpu_deps, + copts = gpu_copts, + **kwargs + ) + cuda_deps.extend([":" + name + "_gpu"]) + kwargs["tags"] = kwargs.get("tags", []) + [ + "req_dep=%s" % clean_dep("@org_tensorflow//tensorflow/core:gpu_lib"), + "req_dep=@local_config_cuda//cuda:cuda_headers", + ] + tf_gpu_library( + name = name, + srcs = srcs, + hdrs = hdrs, + textual_hdrs = textual_hdrs, + copts = copts, + cuda_deps = cuda_deps + gpu_deps, + linkstatic = 1, # Needed since alwayslink is broken in bazel b/27630669 + alwayslink = alwayslink, + deps = deps, + compatible_with = compatible_with, + **kwargs + ) + + # TODO(gunan): CUDA dependency not clear here. Fix it. + tf_cc_shared_object( + name = "libtfkernel_%s.so" % name, + srcs = srcs + hdrs, + copts = copts, + tags = ["manual", "notap"], + deps = deps, + ) + +register_extension_info( + extension = tf_kernel_library, + label_regex_for_dep = "{extension_name}", +) + +def tf_mkl_kernel_library( + name, + prefix = None, + srcs = None, + hdrs = None, + deps = None, + alwayslink = 1, + # Adding an explicit `-fexceptions` because `allow_exceptions = True` + # in `tf_copts` doesn't work internally. + copts = tf_copts() + ["-fexceptions"] + tf_openmp_copts(), + linkopts = tf_openmp_lopts()): + """A rule to build MKL-based TensorFlow kernel libraries.""" + + if not bool(srcs): + srcs = [] + if not bool(hdrs): + hdrs = [] + + if prefix: + srcs = srcs + native.glob( + [prefix + "*.cc"], + exclude = [prefix + "*test*"], + ) + hdrs = hdrs + native.glob( + [prefix + "*.h"], + exclude = [prefix + "*test*"], + ) + + # -fno-exceptions in nocopts breaks compilation if header modules are enabled. + disable_header_modules = ["-use_header_modules"] + + cc_library( + name = name, + srcs = if_mkl(srcs), + hdrs = hdrs, + deps = deps, + linkopts = linkopts, + alwayslink = alwayslink, + copts = copts + if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]), + features = disable_header_modules, + ) + +def _get_transitive_headers(hdrs, deps): + """Obtain the header files for a target and its transitive dependencies. + + Args: + hdrs: a list of header files + deps: a list of targets that are direct dependencies + + Returns: + a collection of the transitive headers + """ + return depset( + hdrs, + transitive = [dep[CcInfo].compilation_context.headers for dep in deps], + ) + +def _get_repository_roots(ctx, files): + """Returns abnormal root directories under which files reside. 
+ + When running a ctx.action, source files within the main repository are all + relative to the current directory; however, files that are generated or exist + in remote repositories will have their root directory be a subdirectory, + e.g. bazel-out/local-fastbuild/genfiles/external/jpeg_archive. This function + returns the set of these devious directories, ranked and sorted by popularity + in order to hopefully minimize the number of I/O system calls within the + compiler, because includes have quadratic complexity. + """ + result = {} + for f in files.to_list(): + root = f.root.path + if root: + if root not in result: + result[root] = 0 + result[root] -= 1 + work = f.owner.workspace_root + if work: + if root: + root += "/" + root += work + if root: + if root not in result: + result[root] = 0 + result[root] -= 1 + return [k for v, k in sorted([(v, k) for k, v in result.items()])] + +def tf_custom_op_library_additional_deps(): + return [ + "@com_google_protobuf//:protobuf_headers", # copybara:comment + clean_dep("//third_party/eigen3"), + clean_dep("@org_tensorflow//tensorflow/core:framework_headers_lib"), + ] + +# A list of targets that contains the implementation of +# tf_custom_op_library_additional_deps. It's used to generate a DEF file for +# exporting symbols from _pywrap_tensorflow.dll on Windows. +def tf_custom_op_library_additional_deps_impl(): + return [ + # copybara:comment_begin + "@com_google_protobuf//:protobuf", + "@nsync//:nsync_cpp", + # copybara:comment_end + + # for //third_party/eigen3 + clean_dep("//third_party/eigen3"), + + # for //tensorflow/core:framework_headers_lib + clean_dep("@org_tensorflow//tensorflow/core:framework"), + clean_dep("@org_tensorflow//tensorflow/core:reader_base"), + ] + +# Traverse the dependency graph along the "deps" attribute of the +# target and return a struct with one field called 'tf_collected_deps'. +# tf_collected_deps will be the union of the deps of the current target +# and the tf_collected_deps of the dependencies of this target. +def _collect_deps_aspect_impl(target, ctx): + direct, transitive = [], [] + all_deps = [] + if hasattr(ctx.rule.attr, "deps"): + all_deps += ctx.rule.attr.deps + if hasattr(ctx.rule.attr, "data"): + all_deps += ctx.rule.attr.data + if hasattr(ctx.rule.attr, "roots"): + all_deps += ctx.rule.attr.roots + for dep in all_deps: + direct.append(dep.label) + if hasattr(dep, "tf_collected_deps"): + transitive.append(dep.tf_collected_deps) + return struct(tf_collected_deps = depset(direct = direct, transitive = transitive)) + +collect_deps_aspect = aspect( + attr_aspects = ["deps", "data", "roots"], + implementation = _collect_deps_aspect_impl, +) + +def _dep_label(dep): + label = dep.label + return label.package + ":" + label.name + +# This rule checks that transitive dependencies don't depend on the targets +# listed in the 'disallowed_deps' attribute, but do depend on the targets listed +# in the 'required_deps' attribute. Dependencies considered are targets in the +# 'deps' attribute or the 'data' attribute. +def _check_deps_impl(ctx): + required_deps = ctx.attr.required_deps + disallowed_deps = ctx.attr.disallowed_deps + for input_dep in ctx.attr.deps: + if not hasattr(input_dep, "tf_collected_deps"): + continue + collected_deps = sets.make(input_dep.tf_collected_deps.to_list()) + for disallowed_dep in disallowed_deps: + if sets.contains(collected_deps, disallowed_dep.label): + fail( + "{src} cannot depend on {dep}. 
See: bazel query 'somepath(//{src}, //{dep})'".format( + src = _dep_label(input_dep), + dep = _dep_label(disallowed_dep), + ), + ) + for required_dep in required_deps: + if not sets.contains(collected_deps, required_dep.label): + fail( + _dep_label(input_dep) + " must depend on " + + _dep_label(required_dep), + ) + return [] + +check_deps = rule( + _check_deps_impl, + attrs = { + "deps": attr.label_list( + aspects = [collect_deps_aspect], + mandatory = True, + allow_files = True, + ), + "disallowed_deps": attr.label_list( + default = [], + allow_files = True, + ), + "required_deps": attr.label_list( + default = [], + allow_files = True, + ), + }, +) + +def tf_custom_op_library( + name, + srcs = [], + gpu_srcs = [], + deps = [], + gpu_deps = None, + linkopts = [], + copts = [], + **kwargs): + """Helper to build a dynamic library (.so) from the sources containing implementations of custom ops and kernels.""" + + if not gpu_deps: + gpu_deps = [] + + deps = deps + if_cuda_or_rocm([ + clean_dep("@org_tensorflow//tensorflow/core:stream_executor_headers_lib"), + ]) + if_cuda([ + "@local_config_cuda//cuda:cuda_headers", + "@local_config_cuda//cuda:cudart_static", + ]) + if_windows([ + clean_dep("@org_tensorflow//tensorflow/python:pywrap_tensorflow_import_lib"), + ]) + tf_custom_op_library_additional_deps() + + # Override EIGEN_STRONG_INLINE to inline when + # --define=override_eigen_strong_inline=true to avoid long compiling time. + # See https://github.com/tensorflow/tensorflow/issues/10521 + copts = copts + if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]) + + if gpu_srcs: + basename = name.split(".")[0] + cuda_library( + name = basename + "_gpu", + srcs = gpu_srcs, + copts = copts + tf_copts() + _cuda_copts() + rocm_copts() + + if_tensorrt(["-DGOOGLE_TENSORRT=1"]), + deps = deps + gpu_deps, + **kwargs + ) + deps = deps + [":" + basename + "_gpu"] + + check_deps( + name = name + "_check_deps", + disallowed_deps = [ + clean_dep("@org_tensorflow//tensorflow/core:framework"), + clean_dep("@org_tensorflow//tensorflow/core:lib"), + ], + deps = deps, + ) + tf_cc_shared_object( + name = name, + srcs = srcs, + deps = deps, + data = if_static([name + "_check_deps"]), + copts = copts + tf_copts(is_external = True), + features = ["windows_export_all_symbols"], + linkopts = linkopts + select({ + "//conditions:default": [ + "-lm", + ], + clean_dep("@org_tensorflow//tensorflow:windows"): [], + clean_dep("@org_tensorflow//tensorflow:macos"): [], + }), + **kwargs + ) + +def tf_custom_op_py_library( + name, + srcs = [], + dso = [], + kernels = [], + srcs_version = "PY3", + visibility = None, + deps = [], + **kwargs): + _ignore = [kernels] + _make_tags_mutable(kwargs) + _plain_py_library( + name = name, + data = dso, + srcs = srcs, + srcs_version = srcs_version, + visibility = visibility, + deps = deps, + **kwargs + ) + +# In tf_py_wrap_cc_opensource generated libraries +# module init functions are not exported unless +# they contain one of the keywords in the version file +# this prevents custom python modules. 
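+# (Illustrative example of the problem, not from the upstream comment: a module
+# named `foo` whose `PyInit_foo` symbol is hidden by the version script fails to
+# import with "dynamic module does not define module export function".)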
+# This function attempts to append init_module_name to list of +# exported functions in version script +def _append_init_to_versionscript_impl(ctx): + mod_name = ctx.attr.module_name + if ctx.attr.is_version_script: + ctx.actions.expand_template( + template = ctx.file.template_file, + output = ctx.outputs.versionscript, + substitutions = { + "global:": "global:\n init_%s;\n _init_%s;\n PyInit_*;\n _PyInit_*;" % (mod_name, mod_name), + }, + is_executable = False, + ) + else: + ctx.actions.expand_template( + template = ctx.file.template_file, + output = ctx.outputs.versionscript, + substitutions = { + "*tensorflow*": "*tensorflow*\ninit_%s\n_init_%s\nPyInit_*\n_PyInit_*\n" % (mod_name, mod_name), + }, + is_executable = False, + ) + +_append_init_to_versionscript = rule( + attrs = { + "module_name": attr.string(mandatory = True), + "template_file": attr.label( + allow_single_file = True, + mandatory = True, + ), + "is_version_script": attr.bool( + default = True, + doc = "whether target is a ld version script or exported symbol list", + mandatory = False, + ), + }, + outputs = {"versionscript": "%{name}.lds"}, + implementation = _append_init_to_versionscript_impl, +) + +# This macro should only be used for pywrap_tensorflow_internal.so. +# It was copied and refined from the original tf_py_wrap_cc_opensource rule. +# buildozer: disable=function-docstring-args +def pywrap_tensorflow_macro_opensource( + name, + srcs = [], + roots = [], + deps = [], + dynamic_deps = [], + static_deps = [], + exports_filter = [], + copts = [], + version_script = None, + win_def_file = None): + """Builds the pywrap_tensorflow_internal shared object.""" + module_name = name.split("/")[-1] + + # Convert a rule name such as foo/bar/baz to foo/bar/_baz.so + # and use that as the name for the rule producing the .so file. + cc_library_base = "/".join(name.split("/")[:-1] + ["_" + module_name]) + + # TODO(b/137885063): tf_cc_shared_object needs to be cleaned up; we really + # shouldn't be passing a name qualified with .so here. + cc_shared_library_name = cc_library_base + ".so" + cc_library_pyd_name = "/".join( + name.split("/")[:-1] + ["_" + module_name + ".pyd"], + ) + + # We need pybind11 to export the shared object PyInit symbol only in OSS. + extra_deps = [clean_dep("@pybind11")] + + if not version_script: + version_script = select({ + "@org_tensorflow//tensorflow:macos": clean_dep("@org_tensorflow//tensorflow:tf_exported_symbols.lds"), + "//conditions:default": clean_dep("@org_tensorflow//tensorflow:tf_version_script.lds"), + }) + vscriptname = name + "_versionscript" + _append_init_to_versionscript( + name = vscriptname, + is_version_script = select({ + "@org_tensorflow//tensorflow:macos": False, + "//conditions:default": True, + }), + module_name = module_name, + template_file = version_script, + ) + extra_linkopts = select({ + clean_dep("@org_tensorflow//tensorflow:macos"): [ + # TODO: the -w suppresses a wall of harmless warnings about hidden typeinfo symbols + # not being exported. There should be a better way to deal with this. + "-Wl,-w", + "-Wl,-exported_symbols_list,$(location %s.lds)" % vscriptname, + ], + clean_dep("@org_tensorflow//tensorflow:windows"): [], + "//conditions:default": [ + "-Wl,--version-script", + "$(location %s.lds)" % vscriptname, + ], + }) + additional_linker_inputs = if_windows([], otherwise = ["%s.lds" % vscriptname]) + + # This is needed so that libtensorflow_cc is included in the pip package. 
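+    # For example (illustrative, assuming VERSION_MAJOR = 2): this adds
+    # libtensorflow_cc.so.2 on Linux and libtensorflow_cc.2.dylib on macOS;
+    # nothing is added on Windows.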
+ srcs += select({ + clean_dep("@org_tensorflow//tensorflow:macos"): [clean_dep("@org_tensorflow//tensorflow:libtensorflow_cc.%s.dylib" % VERSION_MAJOR)], + clean_dep("@org_tensorflow//tensorflow:windows"): [], + "//conditions:default": [clean_dep("@org_tensorflow//tensorflow:libtensorflow_cc.so.%s" % VERSION_MAJOR)], + }) + + tf_cc_shared_library_opensource( + name = cc_shared_library_name, + srcs = srcs, + # framework_so is no longer needed as libtf.so is included via the extra_deps. + framework_so = [], + copts = copts + if_not_windows([ + "-Wno-self-assign", + "-Wno-sign-compare", + "-Wno-write-strings", + ]), + linkopts = extra_linkopts, + linkstatic = 1, + roots = roots, + deps = deps + extra_deps, + dynamic_deps = dynamic_deps, + static_deps = static_deps, + exports_filter = exports_filter, + win_def_file = win_def_file, + additional_linker_inputs = additional_linker_inputs, + ) + + # When a non-versioned .so is added as a 'src' to a bazel target, it uses + # -l%(so_name) instead of -l:%(so_file) during linking. When -l%(so_name) + # is passed to ld, it will look for an associated file with the schema + # lib%(so_name).so. Since pywrap_tensorflow is not explicitly versioned + # and is not prefixed with lib_, we add a rule for the creation of an .so + # file with the canonical lib schema (e.g. libNAME.so), so that + # -l%(so_name) is resolved during linking. + # + # See: https://github.com/bazelbuild/bazel/blob/7a6808260a733d50983c1adf0cf5a7493472267f/src/main/java/com/google/devtools/build/lib/rules/cpp/LibrariesToLinkCollector.java#L319 + for pattern in SHARED_LIBRARY_NAME_PATTERNS: + name_os = pattern % (cc_library_base, "") + native.genrule( + name = name_os + "_rule", + srcs = [":" + cc_shared_library_name], + outs = [name_os], + cmd = "cp $< $@", + ) + + native.genrule( + name = "gen_" + cc_library_pyd_name, + srcs = [":" + cc_shared_library_name], + outs = [cc_library_pyd_name], + cmd = "cp $< $@", + ) + + # TODO(amitpatankar): Remove this py_library reference and + # move the dependencies to pywrap_tensorflow. This can + # eliminate one layer of Python import redundancy. We would + # have to change all pywrap_tensorflow imports to + # pywrap_tensorflow_internal. + + # Bazel requires an empty .py file for pywrap_tensorflow_internal.py. + empty_py_file = [name + ".py"] + native.genrule( + name = "empty_py_file_rule", + outs = empty_py_file, + cmd = "touch $@", + ) + + # TODO(b/271333181): This should be done more generally on Windows for every dll dependency + # (there is only one currently) that is not in the same directory, otherwise Python will fail to + # link the pyd (which is just a dll) because of missing dependencies. + _create_symlink("ml_dtypes.so", "@org_tensorflow//tensorflow/tsl/python/lib/core:ml_dtypes.so") + + _plain_py_library( + name = name, + srcs = [":" + name + ".py"], + srcs_version = "PY3", + data = select({ + clean_dep("@org_tensorflow//tensorflow:windows"): [ + ":" + cc_library_pyd_name, + ":ml_dtypes.so", + "@org_tensorflow//tensorflow/tsl/python/lib/core:ml_dtypes.so", + ], + "//conditions:default": [ + ":" + cc_shared_library_name, + ], + }), + ) + +# Export open source version of pywrap_tensorflow_macro under base name as well. +pywrap_tensorflow_macro = pywrap_tensorflow_macro_opensource + +# This macro is for running python tests against system installed pip package +# on Windows. +# +# py_test is built as an executable python zip file on Windows, which contains all +# dependencies of the target. 
Because of the C++ extensions, it would be very +# inefficient if the py_test zips all runfiles, plus we don't need them when running +# tests against system installed pip package. So we'd like to get rid of the deps +# of py_test in this case. +# +# In order to trigger the tests without bazel clean after getting rid of deps, +# we introduce the following : +# 1. When --define=no_tensorflow_py_deps=true, the py_test depends on a marker +# file of the pip package, the test gets to rerun when the pip package change. +# Note that this only works on Windows. See the definition of +# //third_party/tensorflow/tools/pip_package:win_pip_package_marker for specific reasons. +# 2. When --define=no_tensorflow_py_deps=false (by default), it's a normal py_test. +def py_test(deps = [], data = [], kernels = [], exec_properties = None, test_rule = _plain_py_test, **kwargs): + if not exec_properties: + exec_properties = tf_exec_properties(kwargs) + + _make_tags_mutable(kwargs) + test_rule( + deps = select({ + "//conditions:default": deps, + clean_dep("@org_tensorflow//tensorflow:no_tensorflow_py_deps"): [], + }), + data = data + select({ + "//conditions:default": kernels, + clean_dep("@org_tensorflow//tensorflow:no_tensorflow_py_deps"): ["@org_tensorflow//tensorflow/tools/pip_package:win_pip_package_marker"], + }), + exec_properties = exec_properties, + **kwargs + ) + +register_extension_info( + extension = py_test, + label_regex_for_dep = "{extension_name}", +) + +# Similar to py_test above, this macro is used to exclude dependencies for some py_binary +# targets in order to reduce the size of //tensorflow/tools/pip_package:simple_console_windows. +# See https://github.com/tensorflow/tensorflow/issues/22390 +def py_binary(name, deps = [], **kwargs): + # Add an extra target for dependencies to avoid nested select statement. + _plain_py_library( + name = name + "_deps", + deps = deps, + ) + + # Python version placeholder + _make_tags_mutable(kwargs) + _plain_py_binary( + name = name, + deps = select({ + "//conditions:default": [":" + name + "_deps"], + clean_dep("@org_tensorflow//tensorflow:no_tensorflow_py_deps"): [], + }), + **kwargs + ) + +def pytype_library(name, pytype_deps = [], pytype_srcs = [], **kwargs): + # Types not enforced in OSS. + _make_tags_mutable(kwargs) + _plain_py_library(name = name, **kwargs) + +# Tensorflow uses rules_python 0.0.1, and in that version of rules_python, +# the rules require the tags value to be a mutable list because they +# modify it in-place. Later versions of rules_python don't have this +# requirement. +def _make_tags_mutable(kwargs): + if "tags" in kwargs and kwargs["tags"] != None: + # The value might be a frozen list, which looks just like + # a regular list. So always make a copy. + kwargs["tags"] = list(kwargs["tags"]) + +def tf_py_test( + name, + srcs, + size = "medium", + data = [], + main = None, + args = [], + tags = [], + shard_count = 1, + additional_visibility = [], + kernels = [], + flaky = 0, + xla_enable_strict_auto_jit = False, + xla_enabled = False, + grpc_enabled = False, + tfrt_enabled = False, + # `tfrt_enabled` is set for some test targets, and if we enable + # TFRT tests just by that, this will enable TFRT builds for open source. + # TFRT open source is not fully integrated yet so we need a temporary + # workaround to enable TFRT only for internal builds. `tfrt_enabled_internal` + # will be set by `tensorflow.google.bzl`'s `tf_py_test` target, which is + # only applied for internal builds. 
+ # TODO(b/156911178): Revert this temporary workaround once TFRT open source + # is fully integrated with TF. + tfrt_enabled_internal = False, + **kwargs): + """Create one or more python tests with extra tensorflow dependencies.""" + xla_test_true_list = [] + if "additional_deps" in kwargs: + fail("Use `deps` to specify dependencies. `additional_deps` has been replaced with the standard pattern of `deps`.") + deps = kwargs.pop("deps", []) + + # xla_enable_strict_auto_jit is used to run Tensorflow unit tests with all XLA compilable + # kernels compiled with XLA. + if xla_enable_strict_auto_jit: + xla_enabled = True + xla_test_true_list.append("@org_tensorflow//tensorflow/python/framework:is_xla_test_true") + if xla_enabled: + deps = deps + tf_additional_xla_deps_py() + if grpc_enabled: + deps = deps + tf_additional_grpc_deps_py() + + # NOTE(ebrevdo): This is a workaround for depset() not being able to tell + # the difference between 'dep' and 'clean_dep(dep)'. + for to_add in [ + "@org_tensorflow//tensorflow/python:extra_py_tests_deps", + ]: + if to_add not in deps and clean_dep(to_add) not in deps: + deps.append(clean_dep(to_add)) + + env = kwargs.pop("env", {}) + + # Python version placeholder + kwargs.setdefault("srcs_version", "PY3") + py_test( + name = name, + size = size, + srcs = srcs, + args = args, + data = data, + flaky = flaky, + kernels = kernels, + main = main, + shard_count = shard_count, + tags = tags, + visibility = [clean_dep("@org_tensorflow//tensorflow:internal")] + + additional_visibility, + deps = depset(deps + xla_test_true_list), + env = env, + **kwargs + ) + if tfrt_enabled_internal: + tfrt_env = {} + tfrt_env.update(env) + tfrt_env["EXPERIMENTAL_ENABLE_TFRT"] = "1" + + # None `main` defaults to `name` + ".py" in `py_test` target. However, since we + # are appending _tfrt. it becomes `name` + "_tfrt.py" effectively. So force + # set `main` argument without `_tfrt`. + if main == None: + main = name + ".py" + + py_test( + env = tfrt_env, + name = name + "_tfrt", + size = size, + srcs = srcs, + args = args, + data = data, + flaky = flaky, + kernels = kernels, + main = main, + shard_count = shard_count, + tags = tags + ["tfrt"], + visibility = [clean_dep("@org_tensorflow//tensorflow:internal")] + + additional_visibility, + deps = depset(deps + xla_test_true_list), + **kwargs + ) + +register_extension_info( + extension = tf_py_test, + label_regex_for_dep = "{extension_name}(_tfrt)?", +) + +def gpu_py_test( + name, + srcs, + size = "medium", + data = [], + main = None, + args = [], + shard_count = 1, + kernels = [], + tags = [], + flaky = 0, + xla_enable_strict_auto_jit = False, + xla_enabled = False, + grpc_enabled = False, + xla_tags = [], # additional tags for xla_gpu tests + **kwargs): + if main == None: + main = name + ".py" + if "additional_deps" in kwargs: + fail("Use `deps` to specify dependencies. `additional_deps` has been replaced with the standard pattern of `deps`.") + configs = ["cpu", "gpu"] + if "multi_gpu" in tags or "multi_and_single_gpu" in tags: + configs = configs + ["2gpu"] + + targets = [] + + for config in configs: + test_name = name + test_tags = tags + if config == "gpu": + test_tags = test_tags + tf_gpu_tests_tags() + if config == "2gpu": + test_tags = test_tags + two_gpu_tags + if "requires-gpu-nvidia" in test_tags: + test_tags.remove("requires-gpu-nvidia") + + # TODO(b/215751004): CPU on XLA tests are skipped intentionally. 
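+        # For example (illustrative): gpu_py_test(name = "foo_test", ...) expands to
+        # foo_test_cpu and foo_test_gpu (plus foo_test_2gpu for multi-GPU tags, and
+        # foo_test_xla_<config> below when xla_enable_strict_auto_jit is set), all
+        # grouped under a test_suite named foo_test.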
+ if config != "cpu" and xla_enable_strict_auto_jit: + strict_auto_jit_test_name = test_name + "_xla_" + config + tf_py_test( + name = strict_auto_jit_test_name, + size = size, + srcs = srcs, + args = args, + data = data, + flaky = flaky, + grpc_enabled = grpc_enabled, + kernels = kernels, + main = main, + shard_count = shard_count, + tags = test_tags + xla_tags + ["xla", "manual"], + xla_enabled = xla_enabled, + xla_enable_strict_auto_jit = True, + **kwargs + ) + targets.append(strict_auto_jit_test_name) + + test_name = test_name + "_" + config + + tf_py_test( + name = test_name, + size = size, + srcs = srcs, + args = args, + data = data, + flaky = flaky, + grpc_enabled = grpc_enabled, + kernels = kernels, + main = main, + shard_count = shard_count, + tags = test_tags, + xla_enabled = xla_enabled, + xla_enable_strict_auto_jit = False, + **kwargs + ) + targets.append(test_name) + + native.test_suite(name = name, tests = targets, tags = tags) + +# terminology changes: saving cuda_* definition for compatibility +def cuda_py_test(*args, **kwargs): + gpu_py_test(*args, **kwargs) + +register_extension_info( + extension = gpu_py_test, + label_regex_for_dep = "{extension_name}_cpu", +) + +def py_tests( + name, + srcs, + size = "medium", + kernels = [], + data = [], + tags = [], + shard_count = 1, + prefix = "", + xla_enable_strict_auto_jit = False, + xla_enabled = False, + grpc_enabled = False, + tfrt_enabled = False, + **kwargs): + if "additional_deps" in kwargs: + fail("Use `deps` to specify dependencies. `additional_deps` has been replaced with the standard pattern of `deps`.") + for src in srcs: + test_name = src.split("/")[-1].split(".")[0] + if prefix: + test_name = "%s_%s" % (prefix, test_name) + tf_py_test( + name = test_name, + size = size, + srcs = [src], + data = data, + grpc_enabled = grpc_enabled, + kernels = kernels, + main = src, + shard_count = shard_count, + tags = tags, + xla_enabled = xla_enabled, + xla_enable_strict_auto_jit = xla_enable_strict_auto_jit, + tfrt_enabled = tfrt_enabled, + **kwargs + ) + +# Creates a genrule named for running tools/proto_text's generator to +# make the proto_text functions, for the protos passed in . +# +# Return a struct with fields (hdrs, srcs) containing the names of the +# generated files. 
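+#
+# For example (illustrative): srcs = ["foo.proto"] generates
+#   hdrs: foo.pb_text.h, foo.pb_text-impl.h
+#   srcs: foo.pb_text.cc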
+def tf_generate_proto_text_sources(name, srcs_relative_dir, srcs, protodeps = [], deps = [], visibility = None, compatible_with = None): + out_hdrs = ( + [ + p.replace(".proto", ".pb_text.h") + for p in srcs + ] + [p.replace(".proto", ".pb_text-impl.h") for p in srcs] + ) + out_srcs = [p.replace(".proto", ".pb_text.cc") for p in srcs] + native.genrule( + name = name + "_srcs", + srcs = srcs + protodeps + [clean_dep("@org_tensorflow//tensorflow/tools/proto_text:placeholder.txt")], + outs = out_hdrs + out_srcs, + visibility = visibility, + cmd = + "$(location //tensorflow/tools/proto_text:gen_proto_text_functions) " + + "$(@D) " + srcs_relative_dir + " $(SRCS)", + tools = [ + clean_dep("@org_tensorflow//tensorflow/tools/proto_text:gen_proto_text_functions"), + ], + compatible_with = compatible_with, + ) + + native.filegroup( + name = name + "_hdrs", + srcs = out_hdrs, + visibility = visibility, + compatible_with = compatible_with, + ) + + cc_library( + compatible_with = compatible_with, + name = name, + srcs = out_srcs, + hdrs = out_hdrs, + visibility = visibility, + deps = deps, + alwayslink = 1, + ) + +def tf_genrule_cmd_append_to_srcs(to_append): + return ("cat $(SRCS) > $(@) && " + "echo >> $(@) && " + "echo " + to_append + + " >> $(@)") + +def _local_exec_transition_impl(settings, attr): + return { + # Force all targets in the subgraph to build on the local machine. + "//command_line_option:modify_execution_info": ".*=+no-remote-exec", + } + +# A transition that forces all targets in the subgraph to be built locally. +_local_exec_transition = transition( + implementation = _local_exec_transition_impl, + inputs = [], + outputs = [ + "//command_line_option:modify_execution_info", + ], +) + +def _local_genrule_impl(ctx): + ctx.actions.run_shell( + outputs = [ctx.outputs.out], + inputs = [f for t in ctx.attr.srcs for f in t.files.to_list()], + tools = [ctx.executable.exec_tool], + arguments = [f.path for t in ctx.attr.srcs for f in t.files.to_list()] + + [ctx.outputs.out.path], + command = "%s %s" % (ctx.executable.exec_tool.path, ctx.attr.arguments), + execution_requirements = {"no-remote-exec": ""}, + use_default_shell_env = True, + ) + +# A genrule that executes locally and forces the tool it runs to be built locally. +# For python, we want to build all py_binary rules locally that we also want +# to execute locally, as the remote image might use a different python version. +# TODO(klimek): Currently we still need to annotate the py_binary rules to use +# the local platform when building. When we know how to change the platform +# (https://github.com/bazelbuild/bazel/issues/11035) use this to not require +# annotating the py_binary rules. +_local_genrule_internal = rule( + implementation = _local_genrule_impl, + attrs = { + "out": attr.output(), + "exec_tool": attr.label( + executable = True, + cfg = _local_exec_transition, + allow_files = True, + ), + "arguments": attr.string(), + "srcs": attr.label_list( + allow_files = True, + ), + "_whitelist_function_transition": attr.label(default = "@bazel_tools//tools/whitelists/function_transition_whitelist"), + }, +) + +# Wrap the rule in a macro so we can pass in exec_compatible_with. +def _local_genrule(**kwargs): + _local_genrule_internal( + exec_compatible_with = [ + "@local_execution_config_platform//:platform_constraint", + ], + **kwargs + ) + +def tf_version_info_genrule(name, out, compatible_with = None): + # TODO(gunan): Investigate making this action hermetic so we do not need + # to run it locally. 
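+    # Roughly (illustrative): the rule below invokes gen_git_source on the local
+    # machine as
+    #   gen_git_source --generate <spec.json> <head> <branch_ref> <out> \
+    #       --git_tag_override=${GIT_TAG_OVERRIDE:-}
+    # with remote execution disabled via the no-remote-exec requirement.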
+ _local_genrule( + name = name, + out = out, + compatible_with = compatible_with, + exec_tool = "@org_tensorflow//tensorflow/tools/git:gen_git_source", + srcs = [ + "@local_config_git//:gen/spec.json", + "@local_config_git//:gen/head", + "@local_config_git//:gen/branch_ref", + ], + arguments = "--generate \"$@\" --git_tag_override=${GIT_TAG_OVERRIDE:-}", + ) + +def _dict_to_kv(d): + """Convert a dictionary to a space-joined list of key=value pairs.""" + return " " + " ".join(["%s=%s" % (k, v) for k, v in d.items()]) + +def tf_py_build_info_genrule(name, out): + _local_genrule( + name = name, + out = out, + exec_tool = "@org_tensorflow//tensorflow/tools/build_info:gen_build_info", + arguments = + "--raw_generate \"$@\" " + + " --key_value" + + " is_rocm_build=" + if_rocm("True", "False") + + " is_cuda_build=" + if_cuda("True", "False") + + " is_tensorrt_build=" + if_tensorrt("True", "False") + + if_windows(_dict_to_kv({ + "msvcp_dll_names": "msvcp140.dll,msvcp140_1.dll", + }), "") + if_windows_cuda(_dict_to_kv({ + "nvcuda_dll_name": "nvcuda.dll", + "cudart_dll_name": "cudart{cuda_version}.dll", + "cudnn_dll_name": "cudnn{cudnn_version}.dll", + }), ""), + ) + +def cc_library_with_android_deps( + deps, + android_deps = [], + common_deps = [], + copts = tf_copts(), + **kwargs): + deps = if_not_android(deps) + if_android(android_deps) + common_deps + cc_library(deps = deps, copts = copts, **kwargs) + +def tensorflow_opensource_extra_deps(): + return [] + +# Builds a pybind11 compatible library. +def pybind_library( + name, + copts = [], + features = [], + tags = [], + deps = [], + **kwargs): + # Mark common dependencies as required for build_cleaner. + tags = tags + ["req_dep=" + clean_dep("//third_party/pybind11"), "req_dep=@local_config_python//:python_headers"] + + native.cc_library( + name = name, + copts = copts + ["-fexceptions"], + features = features + [ + "-use_header_modules", # Required for pybind11. + "-parse_headers", + ], + tags = tags, + deps = deps + [clean_dep("//third_party/pybind11"), "@local_config_python//:python_headers"], + **kwargs + ) + +# buildozer: disable=function-docstring-args +def pybind_extension_opensource( + name, + srcs, + module_name = None, # Unused. + hdrs = [], + dynamic_deps = [], + static_deps = [], + deps = [], + additional_exported_symbols = [], + compatible_with = None, + copts = [], + data = [], + defines = [], + deprecation = None, + enable_stub_generation = False, # Unused. 
+ features = [], + link_in_framework = False, + licenses = None, + linkopts = [], + pytype_deps = [], + pytype_srcs = [], + restricted_to = None, + srcs_version = "PY3", + testonly = None, + visibility = None, + win_def_file = None): + """Builds a generic Python extension module.""" + _ignore = [enable_stub_generation, module_name] # buildifier: disable=unused-variable + p = name.rfind("/") + if p == -1: + sname = name + prefix = "" + else: + sname = name[p + 1:] + prefix = name[:p + 1] + so_file = "%s%s.so" % (prefix, sname) + filegroup_name = "%s_filegroup" % name + pyd_file = "%s%s.pyd" % (prefix, sname) + exported_symbols = [ + "init%s" % sname, + "init_%s" % sname, + "PyInit_%s" % sname, + ] + additional_exported_symbols + + exported_symbols_file = "%s-exported-symbols.lds" % name + version_script_file = "%s-version-script.lds" % name + + exported_symbols_output = "\n".join(["_%s" % symbol for symbol in exported_symbols]) + version_script_output = "\n".join([" %s;" % symbol for symbol in exported_symbols]) + + native.genrule( + name = name + "_exported_symbols", + outs = [exported_symbols_file], + cmd = "echo '%s' >$@" % exported_symbols_output, + output_licenses = ["unencumbered"], + visibility = ["//visibility:private"], + testonly = testonly, + ) + + native.genrule( + name = name + "_version_script", + outs = [version_script_file], + cmd = "echo '{global:\n%s\n local: *;};' >$@" % version_script_output, + output_licenses = ["unencumbered"], + visibility = ["//visibility:private"], + testonly = testonly, + ) + + if static_deps: + cc_library_name = so_file + "_cclib" + cc_library( + name = cc_library_name, + hdrs = hdrs, + srcs = srcs + hdrs, + data = data, + deps = deps, + compatible_with = compatible_with, + copts = copts + [ + "-fno-strict-aliasing", + "-fexceptions", + ] + select({ + clean_dep("@org_tensorflow//tensorflow:windows"): [], + "//conditions:default": [ + "-fvisibility=hidden", + ], + }), + defines = defines, + features = features + ["-use_header_modules"], + restricted_to = restricted_to, + testonly = testonly, + visibility = visibility, + ) + + cc_shared_library( + name = so_file, + roots = [cc_library_name], + dynamic_deps = dynamic_deps, + static_deps = static_deps, + additional_linker_inputs = [exported_symbols_file, version_script_file], + compatible_with = compatible_with, + deprecation = deprecation, + features = features + ["-use_header_modules"], + licenses = licenses, + restricted_to = restricted_to, + shared_lib_name = so_file, + testonly = testonly, + user_link_flags = linkopts + _rpath_user_link_flags(name) + select({ + clean_dep("@org_tensorflow//tensorflow:macos"): [ + # TODO: the -w suppresses a wall of harmless warnings about hidden typeinfo symbols + # not being exported. There should be a better way to deal with this. + "-Wl,-w", + "-Wl,-exported_symbols_list,$(location %s)" % exported_symbols_file, + ], + clean_dep("@org_tensorflow//tensorflow:windows"): [], + "//conditions:default": [ + "-Wl,--version-script", + "$(location %s)" % version_script_file, + ], + }), + visibility = visibility, + ) + + # cc_shared_library can generate more than one file. + # Solution to avoid the error "variable '$<' : more than one input file." 
+ filegroup( + name = filegroup_name, + srcs = [so_file], + output_group = "main_shared_library_output", + testonly = testonly, + ) + else: + if link_in_framework: + srcs += tf_binary_additional_srcs() + + cc_binary( + name = so_file, + srcs = srcs + hdrs, + data = data, + copts = copts + [ + "-fno-strict-aliasing", + "-fexceptions", + ] + select({ + clean_dep("@org_tensorflow//tensorflow:windows"): [], + "//conditions:default": [ + "-fvisibility=hidden", + ], + }), + linkopts = linkopts + _rpath_linkopts(name) + select({ + clean_dep("@org_tensorflow//tensorflow:macos"): [ + # TODO: the -w suppresses a wall of harmless warnings about hidden typeinfo symbols + # not being exported. There should be a better way to deal with this. + "-Wl,-w", + "-Wl,-exported_symbols_list,$(location %s)" % exported_symbols_file, + ], + clean_dep("@org_tensorflow//tensorflow:windows"): [], + "//conditions:default": [ + "-Wl,--version-script", + "$(location %s)" % version_script_file, + ], + }), + deps = deps + [ + exported_symbols_file, + version_script_file, + ], + defines = defines, + features = features + ["-use_header_modules"], + linkshared = 1, + testonly = testonly, + licenses = licenses, + visibility = visibility, + deprecation = deprecation, + restricted_to = restricted_to, + compatible_with = compatible_with, + ) + + # For Windows, emulate the above filegroup with the shared object. + native.alias( + name = filegroup_name, + actual = so_file, + ) + + # For Windows only. + native.genrule( + name = name + "_pyd_copy", + srcs = [filegroup_name], + outs = [pyd_file], + cmd = "cp $< $@", + output_to_bindir = True, + visibility = visibility, + deprecation = deprecation, + restricted_to = restricted_to, + compatible_with = compatible_with, + testonly = testonly, + ) + + _plain_py_library( + name = name, + data = select({ + clean_dep("@org_tensorflow//tensorflow:windows"): [pyd_file], + "//conditions:default": [so_file], + }) + pytype_srcs, + deps = pytype_deps, + srcs_version = srcs_version, + licenses = licenses, + testonly = testonly, + visibility = visibility, + deprecation = deprecation, + restricted_to = restricted_to, + compatible_with = compatible_with, + ) + +# Export open source version of pybind_extension under base name as well. +pybind_extension = pybind_extension_opensource + +# Note: we cannot add //third_party/tf_runtime:__subpackages__ here, +# because that builds all of tf_runtime's packages, and some of them +# are known not to build on big endian systems. +# See b/148087476 and +# https://github.com/tensorflow/tensorflow/issues/57844. +# TODO(b/254083070): remove this definition once the packages move to TSL. +def tsl_async_value_deps(): + return [ + "@tf_runtime//:async_value", + "@tf_runtime//:dtype", + "@tf_runtime//:support", + "@tf_runtime//:concurrent_vector", + "@tf_runtime//:ref_count", + "@tf_runtime//third_party/llvm_derived:unique_any", + "@tf_runtime//third_party/llvm_derived:in_place", + ] + +def tf_python_pybind_static_deps(testonly = False): + # TODO(b/146808376): Reduce the dependencies to those that are really needed. 
+ static_deps = [ + "//:__subpackages__", + "@FP16//:__subpackages__", + "@FXdiv//:__subpackages__", + "@XNNPACK//:__subpackages__", + "@arm_neon_2_x86_sse//:__subpackages__", + "@bazel_tools//:__subpackages__", + "@boringssl//:__subpackages__", + "@clog//:__subpackages__", + "@com_github_cares_cares//:__subpackages__", + "@com_github_googlecloudplatform_tensorflow_gcp_tools//:__subpackages__", + "@com_github_grpc_grpc//:__subpackages__", + "@com_google_absl//:__subpackages__", + "@com_google_googleapis//:__subpackages__", + "@com_google_protobuf//:__subpackages__", + "@com_googlesource_code_re2//:__subpackages__", + "@compute_library//:__subpackages__", + "@cpuinfo//:__subpackages__", + "@cudnn_frontend_archive//:__subpackages__", # TFRT integration for TensorFlow. + "@curl//:__subpackages__", + "@dlpack//:__subpackages__", + "@double_conversion//:__subpackages__", + "@eigen_archive//:__subpackages__", + "@farmhash_archive//:__subpackages__", + "@farmhash_gpu_archive//:__subpackages__", + "@fft2d//:__subpackages__", + "@flatbuffers//:__subpackages__", + "@gemmlowp//:__subpackages__", + "@gif//:__subpackages__", + "@highwayhash//:__subpackages__", + "@hwloc//:__subpackages__", + "@icu//:__subpackages__", + "@jsoncpp_git//:__subpackages__", + "@libjpeg_turbo//:__subpackages__", + "@llvm-project//:__subpackages__", + "@llvm_openmp//:__subpackages__", + "@llvm_terminfo//:__subpackages__", + "@llvm_zlib//:__subpackages__", + "@local_config_cuda//:__subpackages__", + "@local_config_git//:__subpackages__", + "@local_config_nccl//:__subpackages__", + "@local_config_python//:__subpackages__", + "@local_config_rocm//:__subpackages__", + "@local_config_tensorrt//:__subpackages__", + "@local_execution_config_platform//:__subpackages__", + "@mkl_dnn_acl_compatible//:__subpackages__", + "@nsync//:__subpackages__", + "@nccl_archive//:__subpackages__", + "@onednn//:__subpackages__", + "@org_sqlite//:__subpackages__", + "@platforms//:__subpackages__", + "@png//:__subpackages__", + "@pthreadpool//:__subpackages__", + "@pybind11//:__subpackages__", + "@ruy//:__subpackages__", + "@snappy//:__subpackages__", + "@sobol_data//:__subpackages__", + "@stablehlo//:__subpackages__", + "@tf_runtime//:__subpackages__", + "@upb//:__subpackages__", + "@zlib//:__subpackages__", + ] + static_deps += tsl_async_value_deps() + static_deps += [] if not testonly else [ + "@com_google_benchmark//:__subpackages__", + "@com_google_googletest//:__subpackages__", + ] + return if_oss(static_deps) + +# buildozer: enable=function-docstring-args +def tf_python_pybind_extension_opensource( + name, + srcs, + module_name = None, + hdrs = [], # TODO(b/264128506): Drop after migration to cc_shared_library. + deps = [], + dynamic_deps = [], + static_deps = [], + compatible_with = None, + copts = [], + defines = [], + features = [], + testonly = False, + visibility = None, + win_def_file = None): + """A wrapper macro for pybind_extension_opensource that is used in tensorflow/python/BUILD. + + Please do not use it anywhere else as it may behave unexpectedly. b/146445820 + + It is used for targets under //third_party/tensorflow/python that link + against libtensorflow_framework.so and pywrap_tensorflow_internal.so. 
+ """ + extended_deps = deps + if_mkl_ml(["//third_party/mkl:intel_binary_blob"]) + extended_deps += [] if dynamic_deps else if_windows([], ["@org_tensorflow//tensorflow:libtensorflow_framework_import_lib"]) + tf_binary_pybind_deps() + pybind_extension_opensource( + name, + srcs, + module_name = module_name, + hdrs = hdrs, + dynamic_deps = dynamic_deps, + static_deps = static_deps, + deps = extended_deps, + compatible_with = compatible_with, + copts = copts, + defines = defines, + features = features, + testonly = testonly, + visibility = visibility, + win_def_file = win_def_file, + ) + +# Export open source version of tf_python_pybind_extension under base name as well. +tf_python_pybind_extension = tf_python_pybind_extension_opensource + +def tf_pybind_cc_library_wrapper_opensource(name, deps, visibility = None, **kwargs): + """Wrapper for cc_library and proto dependencies used by tf_python_pybind_extension_opensource. + + This wrapper ensures that cc libraries' and protos' headers are made + available to pybind code, without creating ODR violations in the dynamically + linked case. The symbols in these deps symbols should be linked to, and + exported by, the core pywrap_tensorflow_internal.so + """ + cc_header_only_library(name = name, deps = deps, visibility = visibility, **kwargs) + +# Export open source version of tf_pybind_cc_library_wrapper under base name as well. +tf_pybind_cc_library_wrapper = tf_pybind_cc_library_wrapper_opensource + +if_cuda_or_rocm = _if_cuda_or_rocm + +def tf_monitoring_framework_deps(link_to_tensorflow_framework = True): + """Get the monitoring libs that will be linked to the tensorflow framework. + + Currently in OSS, the protos must be statically linked to the tensorflow + framework, whereas the grpc should not be linked here. + """ + return select({ + "@org_tensorflow//tensorflow:stackdriver_support": [ + "@com_github_googlecloudplatform_tensorflow_gcp_tools//monitoring:stackdriver_exporter_protos", + ], + "//conditions:default": [], + }) + +def tf_monitoring_python_deps(): + """Get the monitoring libs that will be linked to the python wrapper. + + Currently in OSS, the grpc must be statically linked to the python wrapper + whereas the protos should not be linked here. + """ + return select({ + "@org_tensorflow//tensorflow:stackdriver_support": [ + "@com_github_googlecloudplatform_tensorflow_gcp_tools//monitoring:stackdriver_exporter", + ], + "//conditions:default": [], + }) + +# Teams sharing the same repo can provide their own ops_to_register.h file using +# this function, and pass in -Ipath/to/repo flag when building the target. +def tf_selective_registration_deps(): + return [] + +def tf_jit_compilation_passes_extra_deps(): + return [] + +def if_mlir(if_true, if_false = []): + return select({ + str(Label("@org_tensorflow//tensorflow:with_mlir_support")): if_true, + "//conditions:default": if_false, + }) + +def tf_enable_mlir_bridge(): + return select({ + str(Label("@org_tensorflow//tensorflow:enable_mlir_bridge")): [ + "@org_tensorflow//tensorflow/python/framework:is_mlir_bridge_test_true", + ], + str(Label("@org_tensorflow//tensorflow:disable_mlir_bridge")): [ + "@org_tensorflow//tensorflow/python/framework:is_mlir_bridge_test_false", + ], + "//conditions:default": [], + }) + +def tfcompile_target_cpu(name = ""): + return "" + +def tfcompile_dfsan_enabled(): + return False + +def tfcompile_dfsan_abilists(): + return [] + +def tf_external_workspace_visible(visibility): + # External workspaces can see this target. 
+ return ["//visibility:public"] + +def _filegroup_as_file_impl(ctx): + out = ctx.actions.declare_file(ctx.label.name) + ctx.actions.write( + output = out, + content = "\n".join([f.short_path for f in ctx.files.dep]), + ) + return DefaultInfo(files = depset([out])) + +_filegroup_as_file = rule( + implementation = _filegroup_as_file_impl, + attrs = { + "dep": attr.label(), + }, +) + +def filegroup_as_file(name, dep, visibility = []): + """Creates a filegroup ${name}_file which contains the file ${name}.""" + _filegroup_as_file(name = name, dep = dep) + native.filegroup( + name = name + "_file", + srcs = [name], + visibility = visibility, + ) + +def tf_grpc_dependencies(): + return ["@org_tensorflow//tensorflow:grpc"] + +def tf_grpc_cc_dependencies(): + return ["@org_tensorflow//tensorflow:grpc++"] + +def get_compatible_with_portable(): + return [] + +def filegroup(**kwargs): + native.filegroup(**kwargs) + +def genrule(**kwargs): + native.genrule(**kwargs) + +def internal_tfrt_deps(): + return [] + +def _tf_gen_options_header_impl(ctx): + header_depset = depset([ctx.outputs.output_header]) + + define_vals = {True: "true", False: "false"} + substitutions = {} + for target, identifier in ctx.attr.build_settings.items(): + setting_val = target[BuildSettingInfo].value + lines = [ + "// %s" % target.label, + "#define TF_OPTION_%s() %s" % (identifier, define_vals[setting_val]), + ] + substitutions["#define_option %s" % identifier] = "\n".join(lines) + + ctx.actions.expand_template( + template = ctx.file.template, + output = ctx.outputs.output_header, + substitutions = substitutions, + ) + + return [ + DefaultInfo(files = header_depset), + ] + +tf_gen_options_header = rule( + attrs = { + "output_header": attr.output( + doc = "File path for the generated header (output)", + mandatory = True, + ), + "template": attr.label( + doc = """Template for the header. + For each option name 'X' (see build_settings attribute), + '#define_option X' results in a macro 'TF_OPTION_X()' + """, + allow_single_file = True, + mandatory = True, + ), + "build_settings": attr.label_keyed_string_dict( + doc = """Dictionary from build-setting labels to option names. Example: + {"@org_tensorflow//tensorflow:x_setting" : "X"} + """, + providers = [BuildSettingInfo], + ), + }, + implementation = _tf_gen_options_header_impl, + doc = """ + Generates a header file for Bazel build settings. + + This is an alternative to setting preprocessor defines on the compiler + command line. It has a few advantages: + - Usage of the options requires #include-ing the header, and thus a + Bazel-level dependency. + - Each option has a definition site in source code, which mentions the + corresponding Bazel setting. This is particularly useful when + navigating code with the assistance of static analysis (e.g. + https://cs.opensource.google/tensorflow). + - Each option is represented as a FUNCTION()-style macro, which is always + defined (i.e. one uses #if instead of #ifdef). This allows forms like + 'if constexpr (TF_OPTION_FOO()) { ... }', and helps catch missing + dependencies (if 'F' is undefined, '#if F()' results in an error). + """, +) + +# These flags are used selectively to disable benign ptxas warnings for some +# build targets. On clang "-Xcuda-ptxas --disable-warnings" is sufficient, but +# that does not work on some versions of GCC. So for now this is empty in the +# open source build. +def tf_disable_ptxas_warning_flags(): + return [] + +# Use this to replace the `non_portable_tf_deps` (i.e., tensorflow/core/...) 
with +# tensorflow/core:portable_tensorflow_lib_lite when building portably. +def replace_with_portable_tf_lib_when_required(non_portable_tf_deps, use_lib_with_runtime = False): + portable_tf_lib = "@org_tensorflow//tensorflow/core:portable_tensorflow_lib_lite" + + return select({ + "@org_tensorflow//tensorflow:android": [portable_tf_lib], + "@org_tensorflow//tensorflow:ios": [portable_tf_lib], + "//conditions:default": non_portable_tf_deps, + }) + +def tf_python_framework_friends(): + return ["@org_tensorflow//tensorflow:__subpackages__"] diff --git a/build_deps/requirements.in b/build_deps/requirements.in new file mode 100644 index 00000000..175e899b --- /dev/null +++ b/build_deps/requirements.in @@ -0,0 +1,37 @@ +# Requirements for the Federated Compute Python development environment. +# +# * For packages that have a stable release, we use a version that is +# compatible with that release (e.g. `~=x.y`). See +# https://peps.python.org/pep-0440/#compatible-release for more information. +# * For packages that do not have a stable release, we use a version that +# matches a release that has been tested (e.g. `==x.y.z`). See +# https://peps.python.org/pep-0440/#version-matching for more information. +# +# Note: There is bug in `pip` when multiple packages use the compatible release +# operator `~=` to specify a version and one of those versions ends in `0`. See +# https://github.com/pypa/pip/issues/9613 for more information. In this case, +# use the equivalent clause `>=x.0,==x.*` instead of `~=x.0`. +# +# This assumes that the packages follow Semantic Versioning, see +# https://semver.org/. If a package follows a different versioning scheme or +# requires unique handling, we use a different version specifier and comment the +# versioning scheme or reasoning. + +absl-py~=1.4 +attrs~=23.1 +dm-tree~=0.1.8 +dill == 0.3.6 +pandas +fastparquet +portpicker>=1.6.0 +protobuf>=4.23 +pytest-xdist +pytest~=6.2.5 +scipy~=1.14.1 +tblib == 1.7.0 +tqdm +tf_keras +# The TensorFlow version should match what's specified in the WORKSPACE file for +# C++ targets. 
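+# (Illustrative note: per PEP 440, `tensorflow~=2.15.1` below is equivalent to
+# `tensorflow>=2.15.1,==2.15.*`; the `>=x.0,==x.*` workaround described above is
+# only needed when the pinned version ends in 0.)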
+tensorflow~=2.15.1 +typeguard~=2.13.3 diff --git a/build_deps/requirements_lock_3_10.txt b/build_deps/requirements_lock_3_10.txt new file mode 100644 index 00000000..419ad233 --- /dev/null +++ b/build_deps/requirements_lock_3_10.txt @@ -0,0 +1,983 @@ +# +# This file is autogenerated by pip-compile with Python 3.10 +# by the following command: +# +# bazel run //build_deps:requirements.update +# +--index-url https://pypi.tuna.tsinghua.edu.cn/simple + +absl-py==1.4.0 \ + --hash=sha256:0d3fe606adfa4f7db64792dd4c7aee4ee0c38ab75dfd353b7a83ed3e957fcb47 \ + --hash=sha256:d2c244d01048ba476e7c080bd2c6df5e141d211de80223460d5b3b8a2a58433d + # via + # -r build_deps/requirements.in + # tensorboard + # tensorflow +astunparse==1.6.3 \ + --hash=sha256:5ad93a8456f0d084c3456d059fd9a92cce667963232cbf763eac3bc5b7940872 \ + --hash=sha256:c2652417f2c8b5bb325c885ae329bdf3f86424075c4fd1a128674bc6fba4b8e8 + # via tensorflow +attrs==23.2.0 \ + --hash=sha256:935dc3b529c262f6cf76e50877d35a4bd3c1de194fd41f47a2b7ae8f19971f30 \ + --hash=sha256:99b87a485a5820b23b879f04c2305b44b951b502fd64be915879d77a7e8fc6f1 + # via + # -r build_deps/requirements.in + # pytest +cachetools==5.5.2 \ + --hash=sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4 \ + --hash=sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a + # via google-auth +certifi==2024.8.30 \ + --hash=sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8 \ + --hash=sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9 + # via requests +charset-normalizer==3.4.0 \ + --hash=sha256:0099d79bdfcf5c1f0c2c72f91516702ebf8b0b8ddd8905f97a8aecf49712c621 \ + --hash=sha256:0713f3adb9d03d49d365b70b84775d0a0d18e4ab08d12bc46baa6132ba78aaf6 \ + --hash=sha256:07afec21bbbbf8a5cc3651aa96b980afe2526e7f048fdfb7f1014d84acc8b6d8 \ + --hash=sha256:0b309d1747110feb25d7ed6b01afdec269c647d382c857ef4663bbe6ad95a912 \ + --hash=sha256:0d99dd8ff461990f12d6e42c7347fd9ab2532fb70e9621ba520f9e8637161d7c \ + --hash=sha256:0de7b687289d3c1b3e8660d0741874abe7888100efe14bd0f9fd7141bcbda92b \ + --hash=sha256:1110e22af8ca26b90bd6364fe4c763329b0ebf1ee213ba32b68c73de5752323d \ + --hash=sha256:130272c698667a982a5d0e626851ceff662565379baf0ff2cc58067b81d4f11d \ + --hash=sha256:136815f06a3ae311fae551c3df1f998a1ebd01ddd424aa5603a4336997629e95 \ + --hash=sha256:14215b71a762336254351b00ec720a8e85cada43b987da5a042e4ce3e82bd68e \ + --hash=sha256:1db4e7fefefd0f548d73e2e2e041f9df5c59e178b4c72fbac4cc6f535cfb1565 \ + --hash=sha256:1ffd9493de4c922f2a38c2bf62b831dcec90ac673ed1ca182fe11b4d8e9f2a64 \ + --hash=sha256:2006769bd1640bdf4d5641c69a3d63b71b81445473cac5ded39740a226fa88ab \ + --hash=sha256:20587d20f557fe189b7947d8e7ec5afa110ccf72a3128d61a2a387c3313f46be \ + --hash=sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e \ + --hash=sha256:27623ba66c183eca01bf9ff833875b459cad267aeeb044477fedac35e19ba907 \ + --hash=sha256:285e96d9d53422efc0d7a17c60e59f37fbf3dfa942073f666db4ac71e8d726d0 \ + --hash=sha256:2de62e8801ddfff069cd5c504ce3bc9672b23266597d4e4f50eda28846c322f2 \ + --hash=sha256:2f6c34da58ea9c1a9515621f4d9ac379871a8f21168ba1b5e09d74250de5ad62 \ + --hash=sha256:309a7de0a0ff3040acaebb35ec45d18db4b28232f21998851cfa709eeff49d62 \ + --hash=sha256:35c404d74c2926d0287fbd63ed5d27eb911eb9e4a3bb2c6d294f3cfd4a9e0c23 \ + --hash=sha256:3710a9751938947e6327ea9f3ea6332a09bf0ba0c09cae9cb1f250bd1f1549bc \ + --hash=sha256:3d59d125ffbd6d552765510e3f31ed75ebac2c7470c7274195b9161a32350284 \ + 
--hash=sha256:40d3ff7fc90b98c637bda91c89d51264a3dcf210cade3a2c6f838c7268d7a4ca \ + --hash=sha256:425c5f215d0eecee9a56cdb703203dda90423247421bf0d67125add85d0c4455 \ + --hash=sha256:43193c5cda5d612f247172016c4bb71251c784d7a4d9314677186a838ad34858 \ + --hash=sha256:44aeb140295a2f0659e113b31cfe92c9061622cadbc9e2a2f7b8ef6b1e29ef4b \ + --hash=sha256:47334db71978b23ebcf3c0f9f5ee98b8d65992b65c9c4f2d34c2eaf5bcaf0594 \ + --hash=sha256:4796efc4faf6b53a18e3d46343535caed491776a22af773f366534056c4e1fbc \ + --hash=sha256:4a51b48f42d9358460b78725283f04bddaf44a9358197b889657deba38f329db \ + --hash=sha256:4b67fdab07fdd3c10bb21edab3cbfe8cf5696f453afce75d815d9d7223fbe88b \ + --hash=sha256:4ec9dd88a5b71abfc74e9df5ebe7921c35cbb3b641181a531ca65cdb5e8e4dea \ + --hash=sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6 \ + --hash=sha256:54b6a92d009cbe2fb11054ba694bc9e284dad30a26757b1e372a1fdddaf21920 \ + --hash=sha256:55f56e2ebd4e3bc50442fbc0888c9d8c94e4e06a933804e2af3e89e2f9c1c749 \ + --hash=sha256:5726cf76c982532c1863fb64d8c6dd0e4c90b6ece9feb06c9f202417a31f7dd7 \ + --hash=sha256:5d447056e2ca60382d460a604b6302d8db69476fd2015c81e7c35417cfabe4cd \ + --hash=sha256:5ed2e36c3e9b4f21dd9422f6893dec0abf2cca553af509b10cd630f878d3eb99 \ + --hash=sha256:5ff2ed8194587faf56555927b3aa10e6fb69d931e33953943bc4f837dfee2242 \ + --hash=sha256:62f60aebecfc7f4b82e3f639a7d1433a20ec32824db2199a11ad4f5e146ef5ee \ + --hash=sha256:63bc5c4ae26e4bc6be6469943b8253c0fd4e4186c43ad46e713ea61a0ba49129 \ + --hash=sha256:6b40e8d38afe634559e398cc32b1472f376a4099c75fe6299ae607e404c033b2 \ + --hash=sha256:6b493a043635eb376e50eedf7818f2f322eabbaa974e948bd8bdd29eb7ef2a51 \ + --hash=sha256:6dba5d19c4dfab08e58d5b36304b3f92f3bd5d42c1a3fa37b5ba5cdf6dfcbcee \ + --hash=sha256:6fd30dc99682dc2c603c2b315bded2799019cea829f8bf57dc6b61efde6611c8 \ + --hash=sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b \ + --hash=sha256:7706f5850360ac01d80c89bcef1640683cc12ed87f42579dab6c5d3ed6888613 \ + --hash=sha256:7782afc9b6b42200f7362858f9e73b1f8316afb276d316336c0ec3bd73312742 \ + --hash=sha256:79983512b108e4a164b9c8d34de3992f76d48cadc9554c9e60b43f308988aabe \ + --hash=sha256:7f683ddc7eedd742e2889d2bfb96d69573fde1d92fcb811979cdb7165bb9c7d3 \ + --hash=sha256:82357d85de703176b5587dbe6ade8ff67f9f69a41c0733cf2425378b49954de5 \ + --hash=sha256:84450ba661fb96e9fd67629b93d2941c871ca86fc38d835d19d4225ff946a631 \ + --hash=sha256:86f4e8cca779080f66ff4f191a685ced73d2f72d50216f7112185dc02b90b9b7 \ + --hash=sha256:8cda06946eac330cbe6598f77bb54e690b4ca93f593dee1568ad22b04f347c15 \ + --hash=sha256:8ce7fd6767a1cc5a92a639b391891bf1c268b03ec7e021c7d6d902285259685c \ + --hash=sha256:8ff4e7cdfdb1ab5698e675ca622e72d58a6fa2a8aa58195de0c0061288e6e3ea \ + --hash=sha256:9289fd5dddcf57bab41d044f1756550f9e7cf0c8e373b8cdf0ce8773dc4bd417 \ + --hash=sha256:92a7e36b000bf022ef3dbb9c46bfe2d52c047d5e3f3343f43204263c5addc250 \ + --hash=sha256:92db3c28b5b2a273346bebb24857fda45601aef6ae1c011c0a997106581e8a88 \ + --hash=sha256:95c3c157765b031331dd4db3c775e58deaee050a3042fcad72cbc4189d7c8dca \ + --hash=sha256:980b4f289d1d90ca5efcf07958d3eb38ed9c0b7676bf2831a54d4f66f9c27dfa \ + --hash=sha256:9ae4ef0b3f6b41bad6366fb0ea4fc1d7ed051528e113a60fa2a65a9abb5b1d99 \ + --hash=sha256:9c98230f5042f4945f957d006edccc2af1e03ed5e37ce7c373f00a5a4daa6149 \ + --hash=sha256:9fa2566ca27d67c86569e8c85297aaf413ffab85a8960500f12ea34ff98e4c41 \ + --hash=sha256:a14969b8691f7998e74663b77b4c36c0337cb1df552da83d5c9004a93afdb574 \ + 
--hash=sha256:a8aacce6e2e1edcb6ac625fb0f8c3a9570ccc7bfba1f63419b3769ccf6a00ed0 \ + --hash=sha256:a8e538f46104c815be19c975572d74afb53f29650ea2025bbfaef359d2de2f7f \ + --hash=sha256:aa41e526a5d4a9dfcfbab0716c7e8a1b215abd3f3df5a45cf18a12721d31cb5d \ + --hash=sha256:aa693779a8b50cd97570e5a0f343538a8dbd3e496fa5dcb87e29406ad0299654 \ + --hash=sha256:ab22fbd9765e6954bc0bcff24c25ff71dcbfdb185fcdaca49e81bac68fe724d3 \ + --hash=sha256:ab2e5bef076f5a235c3774b4f4028a680432cded7cad37bba0fd90d64b187d19 \ + --hash=sha256:ab973df98fc99ab39080bfb0eb3a925181454d7c3ac8a1e695fddfae696d9e90 \ + --hash=sha256:af73657b7a68211996527dbfeffbb0864e043d270580c5aef06dc4b659a4b578 \ + --hash=sha256:b197e7094f232959f8f20541ead1d9862ac5ebea1d58e9849c1bf979255dfac9 \ + --hash=sha256:b295729485b06c1a0683af02a9e42d2caa9db04a373dc38a6a58cdd1e8abddf1 \ + --hash=sha256:b8831399554b92b72af5932cdbbd4ddc55c55f631bb13ff8fe4e6536a06c5c51 \ + --hash=sha256:b8dcd239c743aa2f9c22ce674a145e0a25cb1566c495928440a181ca1ccf6719 \ + --hash=sha256:bcb4f8ea87d03bc51ad04add8ceaf9b0f085ac045ab4d74e73bbc2dc033f0236 \ + --hash=sha256:bd7af3717683bea4c87acd8c0d3d5b44d56120b26fd3f8a692bdd2d5260c620a \ + --hash=sha256:bf4475b82be41b07cc5e5ff94810e6a01f276e37c2d55571e3fe175e467a1a1c \ + --hash=sha256:c3e446d253bd88f6377260d07c895816ebf33ffffd56c1c792b13bff9c3e1ade \ + --hash=sha256:c57516e58fd17d03ebe67e181a4e4e2ccab1168f8c2976c6a334d4f819fe5944 \ + --hash=sha256:c94057af19bc953643a33581844649a7fdab902624d2eb739738a30e2b3e60fc \ + --hash=sha256:cab5d0b79d987c67f3b9e9c53f54a61360422a5a0bc075f43cab5621d530c3b6 \ + --hash=sha256:ce031db0408e487fd2775d745ce30a7cd2923667cf3b69d48d219f1d8f5ddeb6 \ + --hash=sha256:cee4373f4d3ad28f1ab6290684d8e2ebdb9e7a1b74fdc39e4c211995f77bec27 \ + --hash=sha256:d5b054862739d276e09928de37c79ddeec42a6e1bfc55863be96a36ba22926f6 \ + --hash=sha256:dbe03226baf438ac4fda9e2d0715022fd579cb641c4cf639fa40d53b2fe6f3e2 \ + --hash=sha256:dc15e99b2d8a656f8e666854404f1ba54765871104e50c8e9813af8a7db07f12 \ + --hash=sha256:dcaf7c1524c0542ee2fc82cc8ec337f7a9f7edee2532421ab200d2b920fc97cf \ + --hash=sha256:dd4eda173a9fcccb5f2e2bd2a9f423d180194b1bf17cf59e3269899235b2a114 \ + --hash=sha256:dd9a8bd8900e65504a305bf8ae6fa9fbc66de94178c420791d0293702fce2df7 \ + --hash=sha256:de7376c29d95d6719048c194a9cf1a1b0393fbe8488a22008610b0361d834ecf \ + --hash=sha256:e7fdd52961feb4c96507aa649550ec2a0d527c086d284749b2f582f2d40a2e0d \ + --hash=sha256:e91f541a85298cf35433bf66f3fab2a4a2cff05c127eeca4af174f6d497f0d4b \ + --hash=sha256:e9e3c4c9e1ed40ea53acf11e2a386383c3304212c965773704e4603d589343ed \ + --hash=sha256:ee803480535c44e7f5ad00788526da7d85525cfefaf8acf8ab9a310000be4b03 \ + --hash=sha256:f09cb5a7bbe1ecae6e87901a2eb23e0256bb524a79ccc53eb0b7629fbe7677c4 \ + --hash=sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67 \ + --hash=sha256:f1a2f519ae173b5b6a2c9d5fa3116ce16e48b3462c8b96dfdded11055e3d6365 \ + --hash=sha256:f28f891ccd15c514a0981f3b9db9aa23d62fe1a99997512b0491d2ed323d229a \ + --hash=sha256:f3e73a4255342d4eb26ef6df01e3962e73aa29baa3124a8e824c5d3364a65748 \ + --hash=sha256:f606a1881d2663630ea5b8ce2efe2111740df4b687bd78b34a8131baa007f79b \ + --hash=sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079 \ + --hash=sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482 + # via requests +cramjam==2.10.0 \ + --hash=sha256:001fc2572adc655406fb899087f57a740e58a800b05acdccac8bf5759b617d90 \ + --hash=sha256:04f54bea9ce39c440d1ac6901fe4d647f9218dd5cd8fe903c6fe9c42bf5e1f3b \ + 
--hash=sha256:05793857773ec62101edf2c0d22d8edc955707727124f637d2f6cc138e5f97aa \ + --hash=sha256:06ad4a8b368d30ded1d932d9eed647962fbe44923269185a6bbd5e0d11cc39ab \ + --hash=sha256:0acb17e3681138b48300b27d3409742c81d5734ec39c650a60a764c135197840 \ + --hash=sha256:0d27fe3e316f9ae7fe1367b6daf0ffc993c1c66edae588165ac0f41f91a5a6b1 \ + --hash=sha256:112638a4cdf806509d2d2661cb519d239d731bd5fd2e95f211c48ac0f0deeab5 \ + --hash=sha256:11c5ef0c70d6bdd8e1d8afed8b0430709b22decc3865eb6c0656aa00117a7b3d \ + --hash=sha256:17dda15edf256362edb30dcb1d5ecdcd727d946c6be0d1b130e736f3f49487dc \ + --hash=sha256:1a200b74220dcd80c2bb99e3bfe1cdb1e4ed0f5c071959f4316abd65f9ef1e39 \ + --hash=sha256:1c071765bdd5eefa3b2157a61e84d72e161b63f95eb702a0133fee293800a619 \ + --hash=sha256:1e826469cfbb6dcd5b967591e52855073267835229674cfa3d327088805855da \ + --hash=sha256:22a7ab05c62b0a71fcd6db4274af1508c5ea039a43fb143ac50a62f86e6f32f7 \ + --hash=sha256:2464bdf0e2432e0f07a834f48c16022cd7f4648ed18badf52c32c13d6722518c \ + --hash=sha256:260732e3b5c56d6182586f3a7fc5e3f3641b27bfbad5883e8d8e292af85a6870 \ + --hash=sha256:26c44f17938cf00a339899ce6ea7ba12af7b1210d707a80a7f14724fba39869b \ + --hash=sha256:27b2625c0840b9a5522eba30b165940084391762492e03b9d640fca5074016ae \ + --hash=sha256:28a13c0317e71121b2059ffa8beefa2b185be241c52f740f6eb261f0067186db \ + --hash=sha256:2c1eb6e6c3d5c1cc3f7c7f8a52e034340a3c454641f019687fa94077c05da5c2 \ + --hash=sha256:2c24907c972aca7b56c8326307e15d78f56199852dda1e67e4e54c2672afede4 \ + --hash=sha256:2c7008bb54bdc5d130c0e8581925dfcbdc6f0a4d2051de7a153bfced9a31910f \ + --hash=sha256:2e419b65538786fc1f0cf776612262d4bf6c9449983d3fc0d0acfd86594fe551 \ + --hash=sha256:337ceb50bde7708b2a4068f3000625c23ceb1b2497edce2e21fd08ef58549170 \ + --hash=sha256:3484f1595eef64cefed05804d7ec8a88695f89086c49b086634e44c16f3d4769 \ + --hash=sha256:3596b6ceaf85f872c1e56295c6ec80bb15fdd71e7ed9e0e5c3e654563dcc40a2 \ + --hash=sha256:35bcecff38648908a4833928a892a1e7a32611171785bef27015107426bc1d9d \ + --hash=sha256:38fba4594dd0e2b7423ef403039e63774086ebb0696d9060db20093f18a2f43e \ + --hash=sha256:3a94fe7024137ed8bf200308000d106874afe52ff203f852f43b3547eddfa10e \ + --hash=sha256:3e0b70fe7796b63b87cb7ebfaad0ebaca7574fdf177311952f74b8bda6522fb8 \ + --hash=sha256:42dcd7c83104edae70004a8dc494e4e57de4940e3019e5d2cbec2830d5908a85 \ + --hash=sha256:44c15f6117031a84497433b5f55d30ee72d438fdcba9778fec0c5ca5d416aa96 \ + --hash=sha256:44c2660ee7c4c269646955e4e40c2693f803fbad12398bb31b2ad00cfc6027b8 \ + --hash=sha256:4b201aacc7a06079b063cfbcf5efe78b1e65c7279b2828d06ffaa90a8316579d \ + --hash=sha256:4b3e0067ae3513e4cbd0efbabbe5a2bcfa2c2d4bddc67188eeb0751b9a02fdb7 \ + --hash=sha256:4ba90f7b8f986934f33aad8cc029cf7c74842d3ecd5eda71f7531330d38a8dc4 \ + --hash=sha256:4c7bab3703babb93c9dd4444ac9797d01ec46cf521e247d3319bfb292414d053 \ + --hash=sha256:5018c7414047f640b126df02e9286a8da7cc620798cea2b39bac79731c2ee336 \ + --hash=sha256:50b59e981f219d6840ac43cda8e885aff1457944ddbabaa16ac047690bfd6ad1 \ + --hash=sha256:51eb00c72d4a93e4a2ddcc751ba2a7a1318026247e80742866912ec82b39e5ce \ + --hash=sha256:5264ac242697fbb1cfffa79d0153cbc4c088538bd99d60cfa374e8a8b83e2bb5 \ + --hash=sha256:570c81f991033e624874475ade96b601f1db2c51b3e69c324072adcfb23ef5aa \ + --hash=sha256:5b21b1672814ecce88f1da76635f0483d2d877d4cb8998db3692792f46279bf1 \ + --hash=sha256:5b34f4678d386c64d3be402fdf67f75e8f1869627ea2ec4decd43e828d3b6fba \ + --hash=sha256:5c52805c7ccb533fe42d3d36c91d237c97c3b6551cd6b32f98b79eeb30d0f139 \ + 
--hash=sha256:61b7f3c81e5e9015e73e5f423706b2f5e85a07ce79dea35645fad93505ff06cf \ + --hash=sha256:636a48e2d01fe8d7955e9523efd2f8efce55a0221f3b5d5b4bdf37c7ff056bf1 \ + --hash=sha256:645827af834a64145ba4b06f703342b2dbe1d40d1a48fb04e82373bd95cf68e2 \ + --hash=sha256:647553c44cf6b5ce2d9b56e743cc1eab886940d776b36438183e807bb5a7a42b \ + --hash=sha256:6655d04942f7c02087a6bba4bdc8d88961aa8ddf3fb9a05b3bad06d2d1ca321b \ + --hash=sha256:68362d87372a90b9717536238c81d74d7feb4a14392ac239ceb61c1c199a9bac \ + --hash=sha256:6d86c1e2006fe82a8679ed851c2462a6019b57255b3902d16ac35df4a37f6cdd \ + --hash=sha256:73b6ffc8ffe6546462ccc7e34ca3acd9eb3984e1232645f498544a7eab6b8aca \ + --hash=sha256:7699d61c712bc77907c48fe63a21fffa03c4dd70401e1d14e368af031fde7c21 \ + --hash=sha256:76e4e42f2ecf1aca0a710adaa23000a192efb81a2aee3bcc16761f1777f08a74 \ + --hash=sha256:77192bc1a9897ecd91cf977a5d5f990373e35a8d028c9141c8c3d3680a4a4cd7 \ + --hash=sha256:7ab6f36c772109c974890eafff2a841ddbf38ea1293b01a778b28f26089a890d \ + --hash=sha256:7dda9be2caf067ac21c4aa63497833e0984908b66849c07aaa42b1cfa93f5e1c \ + --hash=sha256:7ddbf6a3d3def7ae46638ebf87d7746ccebf22f885a87884ac24d97943af3f30 \ + --hash=sha256:8695857e0b0b5289fabb6c200b95e2b18d8575551ddd9d50746b3d78b6fb5aa8 \ + --hash=sha256:86b29e349064821ceeb14d60d01a11a0788f94e73ed4b3a5c3f9fac7aa4e2cd7 \ + --hash=sha256:88754dd516f0e2f4dd242880b8e760dc854e917315a17fe3fc626475bea9b252 \ + --hash=sha256:8b40d46d2aa566f8e3def953279cce0191e47364b453cda492db12a84dd97f78 \ + --hash=sha256:8bb0b6aaaa5f37091e05d756a3337faf0ddcffe8a68dbe8a710731b0d555ec8f \ + --hash=sha256:91ab85752a08dc875a05742cfda0234d7a70fadda07dd0b0582cfe991911f332 \ + --hash=sha256:92fd6e784ade210c3522bc627b3938821d12fac52acefe4d6630460e243e28de \ + --hash=sha256:967f5f0f22bf5dba4e4d7abe9594b28f5da95606225a50555926ff6e975d84dd \ + --hash=sha256:9cadef44f5ad4c5b4d06ba3c28464d70241a40539c0343b1821ba43102b6a9fc \ + --hash=sha256:9e20ebea6ec77232cd12e4084c8be6d03534dc5f3d027d365b32766beafce6c3 \ + --hash=sha256:a01e89e99ba066dfa2df40fe99a2371565f4a3adc6811a73c8019d9929a312e8 \ + --hash=sha256:a04376601c8f9714fb3a6a0a1699b85aab665d9d952a2a31fb37cf70e1be1fba \ + --hash=sha256:a094ca72440364bc1d0a793555875e515b0d7cc0eef171f4cd49c7e4855ba06e \ + --hash=sha256:a120fc0514c9ed9a4051d040ddd36176241d4f54c4a37d8e4f3d29ac9bdb4c3a \ + --hash=sha256:a2742eea6e336961167c5b6a2393fa04d54bdb10980f0d60ea36ed0a824e9a20 \ + --hash=sha256:a2923b8cd2fcbd22e0842decb66bf925a9e95bda165490d037c355e5df8fef68 \ + --hash=sha256:a71ab695a16c6d5aeae1f02fcc37fbd1ae876e8fb339337aca187012a3d6c0a2 \ + --hash=sha256:ac5a8a3ef660e6869a7761cd0664223eb546b2d17e9121c8ab0ad46353635611 \ + --hash=sha256:acef0e2c4d9f38428721a0ec878dee3fb73a35e640593d99c9803457dbb65214 \ + --hash=sha256:adf484b06063134ae604d4fc826d942af7e751c9d0b2fcab5bf1058a8ebe242b \ + --hash=sha256:afa36aa006d7692718fce427ecb276211918447f806f80c19096a627f5122e3d \ + --hash=sha256:b07fe3e48c881a75a11f722e1d5b052173b5e7c78b22518f659b8c9b4ac4c937 \ + --hash=sha256:b8dee2e4a402dac2df110e7b02fae49507a63b44b6fd91350cf069f31545a925 \ + --hash=sha256:ba19308b8e19cdaadfbf47142f52b705d2cbfb8edd84a8271573e50fa7fa022d \ + --hash=sha256:bcedda2ef2560e6e62cac03734ab1ad28616206b4d4f2d138440b4f43e18c395 \ + --hash=sha256:bf1321a40da930edeff418d561dfb03e6d59d5b8ab5cbab1c4b03ff0aa4c6d21 \ + --hash=sha256:c6afff7e9da53afb8d11eae27a20ee5709e2943b39af6c949b38424d0f271569 \ + --hash=sha256:cddd12ee5a2ef4100478db7f5563a9cdb8bc0a067fbd8ccd1ecdc446d2e6a41a \ + 
--hash=sha256:ce11be5722c9d433c5e1eb3980f16eb7d80828b9614f089e28f4f1724fc8973f \ + --hash=sha256:ce208a3e4043b8ce89e5d90047da16882456ea395577b1ee07e8215dce7d7c91 \ + --hash=sha256:d46fd5a9e8eb5d56eccc6191a55e3e1e2b3ab24b19ab87563a2299a39c855fd7 \ + --hash=sha256:d61a21e4153589bd53ffe71b553f93f2afbc8fb7baf63c91a83c933347473083 \ + --hash=sha256:d84581c869d279fab437182d5db2b590d44975084e8d50b164947f7aaa2c5f25 \ + --hash=sha256:de3e4be5aa71b73c2640c9b86e435ec033592f7f79787937f8342259106a63ae \ + --hash=sha256:def47645b1b970fd97f063da852b0ddc4f5bdee9af8d5b718d9682c7b828d89d \ + --hash=sha256:e0744e391ea8baf0ddea5a180b0aa71a6a302490c14d7a37add730bf0172c7c6 \ + --hash=sha256:e193918c81139361f3f45db19696d31847601f2c0e79a38618f34d7bff6ee704 \ + --hash=sha256:e1c03360c1760f8608dc5ce1ddd7e5491180765360cae8104b428d5f86fbe1b9 \ + --hash=sha256:e2d216ed4aca2090eabdd354204ae55ed3e13333d1a5b271981543696e634672 \ + --hash=sha256:e3012564760394dff89e7a10c5a244f8885cd155aec07bdbe2d6dc46be398614 \ + --hash=sha256:e821dd487384ae8004e977c3b13135ad6665ccf8c9874e68441cad1146e66d8a \ + --hash=sha256:eafdc9d1721afcb4be9d20b980b61d404a592c19067197976a4077f52727bd1a \ + --hash=sha256:f25db473667774725e4f34e738d644ffb205bf0bdc0e8146870a1104c5f42e4a \ + --hash=sha256:fb73ee9616e3efd2cf3857b019c66f9bf287bb47139ea48425850da2ae508670 \ + --hash=sha256:ff7b95bd299c9360e7cb8d226002d58e2917f594ea5af0373efc713f896622b9 + # via fastparquet +dill==0.3.6 \ + --hash=sha256:a07ffd2351b8c678dfc4a856a3005f8067aea51d6ba6c700796a4d9e280f39f0 \ + --hash=sha256:e5db55f3687856d8fbdab002ed78544e1c4559a130302693d839dfe8f93f2373 + # via -r build_deps/requirements.in +dm-tree==0.1.8 \ + --hash=sha256:054b461f8176f4bce7a21f7b1870f873a1ced3bdbe1282c816c550bb43c71fa6 \ + --hash=sha256:09964470f76a5201aff2e8f9b26842976de7889300676f927930f6285e256760 \ + --hash=sha256:0d3172394079a86c3a759179c65f64c48d1a42b89495fcf38976d11cc3bb952c \ + --hash=sha256:0e9620ccf06393eb6b613b5e366469304622d4ea96ae6540b28a33840e6c89cf \ + --hash=sha256:0fcaabbb14e7980377439e7140bd05552739ca5e515ecb3119f234acee4b9430 \ + --hash=sha256:1607ce49aa42f010d1e5e616d92ce899d66835d4d8bea49679582435285515de \ + --hash=sha256:181c35521d480d0365f39300542cb6cd7fd2b77351bb43d7acfda15aef63b317 \ + --hash=sha256:1d7c26e431fc93cc7e0cba867eb000db6a05f6f2b25af11ac4e9dada88fc5bca \ + --hash=sha256:1fe962015b2fe1282892b28ebe962faed53c7f98d942da9a4625cbf27baef913 \ + --hash=sha256:250b692fb75f45f02e2f58fbef9ab338904ef334b90557565621fa251df267cf \ + --hash=sha256:2869228d9c619074de501a3c10dc7f07c75422f8fab36ecdcb859b6f1b1ec3ef \ + --hash=sha256:28c52cbf4f8b3dbd0beaedf44f69fa85eec5e9dede612e08035e06ada6ec9426 \ + --hash=sha256:2f7915660f59c09068e428613c480150180df1060561fd0d1470684ae7007bd1 \ + --hash=sha256:343a4a4ebaa127451ff971254a4be4084eb4bdc0b2513c32b46f6f728fd03f9e \ + --hash=sha256:35cc164a79336bfcfafb47e5f297898359123bbd3330c1967f0c4994f9cf9f60 \ + --hash=sha256:378cc8ad93c5fe3590f405a309980721f021c790ca1bdf9b15bb1d59daec57f5 \ + --hash=sha256:39070ba268c0491af9fe7a58644d99e8b4f2cde6e5884ba3380bddc84ed43d5f \ + --hash=sha256:435227cf3c5dc63f4de054cf3d00183790bd9ead4c3623138c74dde7f67f521b \ + --hash=sha256:5483dca4d7eb1a0d65fe86d3b6a53ae717face83c1f17e0887b1a4a64ae5c410 \ + --hash=sha256:694c3654cfd2a81552c08ec66bb5c4a3d48fa292b9a181880fb081c36c5b9134 \ + --hash=sha256:75c5d528bb992981c20793b6b453e91560784215dffb8a5440ba999753c14ceb \ + --hash=sha256:803bfc53b4659f447ac694dbd04235f94a73ef7c1fd1e0df7c84ac41e0bc963b \ + 
--hash=sha256:81fce77f22a302d7a5968aebdf4efafef4def7ce96528719a354e6990dcd49c7 \ + --hash=sha256:83b7764de0d855338abefc6e3ee9fe40d301668310aa3baea3f778ff051f4393 \ + --hash=sha256:8c60a7eadab64c2278861f56bca320b2720f163dca9d7558103c3b77f2416571 \ + --hash=sha256:8ed3564abed97c806db122c2d3e1a2b64c74a63debe9903aad795167cc301368 \ + --hash=sha256:94d3f0826311f45ee19b75f5b48c99466e4218a0489e81c0f0167bda50cacf22 \ + --hash=sha256:96a548a406a6fb15fe58f6a30a57ff2f2aafbf25f05afab00c8f5e5977b6c715 \ + --hash=sha256:a5d819c38c03f0bb5b3b3703c60e4b170355a0fc6b5819325bf3d4ceb3ae7e80 \ + --hash=sha256:ad16ceba90a56ec47cf45b21856d14962ac314787975ef786efb5e6e9ca75ec7 \ + --hash=sha256:af4b3d372f2477dcd89a6e717e4a575ca35ccc20cc4454a8a4b6f8838a00672d \ + --hash=sha256:b095ba4f8ca1ba19350fd53cf1f8f3eb0bd406aa28af64a6dfc86707b32a810a \ + --hash=sha256:b9bd9b9ccb59409d33d51d84b7668010c04c2af7d4a371632874c1ca356cff3d \ + --hash=sha256:b9f89a454e98806b44fe9d40ec9eee61f848388f7e79ac2371a55679bd5a3ac6 \ + --hash=sha256:bb2d109f42190225112da899b9f3d46d0d5f26aef501c61e43529fe9322530b5 \ + --hash=sha256:c0a94aba18a35457a1b5cd716fd7b46c5dafdc4cf7869b4bae665b91c4682a8e \ + --hash=sha256:c5c8c12e3fda754ef6af94161bacdaeda816d941995fac415d6855c6c386af68 \ + --hash=sha256:d1612fcaecd79023dbc6a6ae48d51a80beb5c385d6f3f6d71688e57bc8d07de8 \ + --hash=sha256:d16e1f2a073604cfcc09f7131ae8d534674f43c3aef4c25742eae295bc60d04f \ + --hash=sha256:d20f2faa3672b52e5013f4077117bfb99c4cfc0b445d3bde1584c34032b57436 \ + --hash=sha256:d40fa4106ca6edc66760246a08f500ec0c85ef55c762fb4a363f6ee739ba02ee \ + --hash=sha256:de287fabc464b8734be251e46e06aa9aa1001f34198da2b6ce07bd197172b9cb \ + --hash=sha256:e4d714371bb08839e4e5e29024fc95832d9affe129825ef38836b143028bd144 \ + --hash=sha256:ea9e59e0451e7d29aece402d9f908f2e2a80922bcde2ebfd5dcb07750fcbfee8 \ + --hash=sha256:f7ac31b9aecccb2c6e1ab29706f6ded3eba0c2c69c770322c9c685929c3d6afb \ + --hash=sha256:fa42a605d099ee7d41ba2b5fb75e21423951fd26e5d50583a00471238fb3021d + # via -r build_deps/requirements.in +execnet==2.1.1 \ + --hash=sha256:26dee51f1b80cebd6d0ca8e74dd8745419761d3bef34163928cbebbdc4749fdc \ + --hash=sha256:5189b52c6121c24feae288166ab41b32549c7e2348652736540b9e6e7d4e72e3 + # via pytest-xdist +fastparquet==2024.11.0 \ + --hash=sha256:053695c2f730b78a2d3925df7cd5c6444d6c1560076af907993361cc7accf3e2 \ + --hash=sha256:0a52eecc6270ae15f0d51347c3f762703dd667ca486f127dc0a21e7e59856ae5 \ + --hash=sha256:0b74333914f454344458dab9d1432fda9b70d62e28dc7acb1512d937ef1424ee \ + --hash=sha256:0e2d7f02f57231e6c86d26e9ea71953737202f20e948790e5d4db6d6a1a150dc \ + --hash=sha256:1ae953c0e3832ae3936b6d92fde493ac7d8b775d7d59d02f7f46f67e1c21ed24 \ + --hash=sha256:29d5c718817bcd765fc519b17f759cad4945974421ecc1931d3bdc3e05e57fa9 \ + --hash=sha256:36b5c9bd2ffaaa26ff45d59a6cefe58503dd748e0c7fad80dd905749da0f2b9e \ + --hash=sha256:374cdfa745aa7d5188430528d5841cf823eb9ad16df72ad6dadd898ccccce3be \ + --hash=sha256:403d31109d398b6be7ce84fa3483fc277c6a23f0b321348c0a505eb098a041cb \ + --hash=sha256:41d1610130b5cb1ce36467766191c5418cba8631e2bfe3affffaf13f9be4e7a8 \ + --hash=sha256:46b2db02fc2a1507939d35441c8ab211d53afd75d82eec9767d1c3656402859b \ + --hash=sha256:4abd3426607335e5ad09be29ef4eeccdf097710e44420deac16893cee64ea0d8 \ + --hash=sha256:4c8401bfd86cccaf0ab7c0ade58c91ae19317ff6092e1d4ad96c2178197d8124 \ + --hash=sha256:561202e8f0e859ccc1aa77c4aaad1d7901b2d50fd6f624ca018bae4c3c7a62ce \ + --hash=sha256:5914ecfa766b7763201b9f49d832a5e89c2dccad470ca4f9c9b228d9a8349756 \ + 
--hash=sha256:59e5c5b51083d5b82572cdb7aed0346e3181e3ac9d2e45759da2e804bdafa7ee \ + --hash=sha256:60ccf587410f0979105e17036df61bb60e1c2b81880dc91895cdb4ee65b71e7f \ + --hash=sha256:63e0e416e25c15daa174aad8ba991c2e9e5b0dc347e5aed5562124261400f87b \ + --hash=sha256:6595d3771b3d587a31137e985f751b4d599d5c8e9af9c4858e373fdf5c3f8720 \ + --hash=sha256:6b7df5d3b61a19d76e209fe8d3133759af1c139e04ebc6d43f3cc2d8045ef338 \ + --hash=sha256:6b936dcf40ca5fff9e70383d48811b1482b871ff74af857cb4db5f4d072f01ab \ + --hash=sha256:6ec7b398a86432993441d0a08dfae59e29649c803ed64ec4b1d7c3e0855b14cb \ + --hash=sha256:74a0b3c40ab373442c0fda96b75a36e88745d8b138fcc3a6143e04682cbbb8ca \ + --hash=sha256:869e167a4067116b4a27eb7adbe597130b2e2e9cfc0f3e84f60e2e182a933f23 \ + --hash=sha256:8b35823ac7a194134e5f82fa4a9659e42e8f9ad1f2d22a55fbb7b9e4053aabbb \ + --hash=sha256:9a9387e77ac608d8978774caaf1e19de67eaa1386806e514dcb19f741b19cfe5 \ + --hash=sha256:a3afdef2895c9f459135a00a7ed3ceafebfbce918a9e7b5d550e4fae39c1b64d \ + --hash=sha256:a5ad5fc14b0567e700bea3cd528a0bd45a6f9371370b49de8889fb3d10a6574a \ + --hash=sha256:bdadf7b6bad789125b823bfc5b0a719ba5c4a2ef965f973702d3ea89cff057f6 \ + --hash=sha256:cbbb9057a26acf0abad7adf58781ee357258b7708ee44a289e3bee97e2f55d42 \ + --hash=sha256:d20632964e65530374ff7cddd42cc06aa0a1388934903693d6d22592a5ba827b \ + --hash=sha256:d24c923a2d9d22a5e7564245f856e6462d524d57982ac8f7479cde991ff73362 \ + --hash=sha256:d281edd625c33628ba028d3221180283d6161bc5ceb55eae1f0ca1678f864f26 \ + --hash=sha256:dbad4b014782bd38b58b8e9f514fe958cfa7a6c4e187859232d29fd5c5ddd849 \ + --hash=sha256:dc475993232c6a64f350aeb928013a807eb93f78675810fd019cbcff39f6baf3 \ + --hash=sha256:e29ff7a367fafa57c6896fb6abc84126e2466811aefd3e4ad4070b9e18820e54 \ + --hash=sha256:e3b1fc73fd3e1b70b0de254bae7feb890436cb67e99458b88cb9bd3cc44db419 \ + --hash=sha256:eb3356862fba2f9b2ea8e679d66901f466c92be8e023439fe854bc392fbf40a6 \ + --hash=sha256:f9cca4c6b5969df5561c13786f9d116300db1ec22c7941e237cfca4ce602f59b \ + --hash=sha256:fa56b19a29008c34cfe8831e810f770080debcbffc69aabd1df4d47572181f9c \ + --hash=sha256:fbe4468146b633d8f09d7b196fea0547f213cb5ce5f76e9d1beb29eaa9593a93 + # via -r build_deps/requirements.in +flatbuffers==24.12.23 \ + --hash=sha256:2910b0bc6ae9b6db78dd2b18d0b7a0709ba240fb5585f286a3a2b30785c22dac \ + --hash=sha256:c418e0d48890f4142b92fd3e343e73a48f194e1f80075ddcc5793779b3585444 + # via tensorflow +fsspec==2025.3.2 \ + --hash=sha256:2daf8dc3d1dfa65b6aa37748d112773a7a08416f6c70d96b264c96476ecaf711 \ + --hash=sha256:e52c77ef398680bbd6a98c0e628fbc469491282981209907bbc8aea76a04fdc6 + # via fastparquet +gast==0.6.0 \ + --hash=sha256:52b182313f7330389f72b069ba00f174cfe2a06411099547288839c6cbafbd54 \ + --hash=sha256:88fc5300d32c7ac6ca7b515310862f71e6fdf2c029bbec7c66c0f5dd47b6b1fb + # via tensorflow +google-auth==2.39.0 \ + --hash=sha256:0150b6711e97fb9f52fe599f55648950cc4540015565d8fbb31be2ad6e1548a2 \ + --hash=sha256:73222d43cdc35a3aeacbfdcaf73142a97839f10de930550d89ebfe1d0a00cde7 + # via + # google-auth-oauthlib + # tensorboard +google-auth-oauthlib==1.2.1 \ + --hash=sha256:2d58a27262d55aa1b87678c3ba7142a080098cbc2024f903c62355deb235d91f \ + --hash=sha256:afd0cad092a2eaa53cd8e8298557d6de1034c6cb4a740500b5357b648af97263 + # via tensorboard +google-pasta==0.2.0 \ + --hash=sha256:4612951da876b1a10fe3960d7226f0c7682cf901e16ac06e473b267a5afa8954 \ + --hash=sha256:b32482794a366b5366a32c92a9a9201b107821889935a02b3e51f6b432ea84ed \ + --hash=sha256:c9f2c8dfc8f96d0d5808299920721be30c9eec37f2389f28904f454565c8a16e + # via tensorflow 
+grpcio==1.68.1 \ + --hash=sha256:025f790c056815b3bf53da850dd70ebb849fd755a4b1ac822cb65cd631e37d43 \ + --hash=sha256:04cfd68bf4f38f5bb959ee2361a7546916bd9a50f78617a346b3aeb2b42e2161 \ + --hash=sha256:0feb02205a27caca128627bd1df4ee7212db051019a9afa76f4bb6a1a80ca95e \ + --hash=sha256:1098f03dedc3b9810810568060dea4ac0822b4062f537b0f53aa015269be0a76 \ + --hash=sha256:12941d533f3cd45d46f202e3667be8ebf6bcb3573629c7ec12c3e211d99cfccf \ + --hash=sha256:255b1635b0ed81e9f91da4fcc8d43b7ea5520090b9a9ad9340d147066d1d3613 \ + --hash=sha256:298ee7f80e26f9483f0b6f94cc0a046caf54400a11b644713bb5b3d8eb387600 \ + --hash=sha256:2c4cec6177bf325eb6faa6bd834d2ff6aa8bb3b29012cceb4937b86f8b74323c \ + --hash=sha256:2cc1fd04af8399971bcd4f43bd98c22d01029ea2e56e69c34daf2bf8470e47f5 \ + --hash=sha256:334ab917792904245a028f10e803fcd5b6f36a7b2173a820c0b5b076555825e1 \ + --hash=sha256:3522c77d7e6606d6665ec8d50e867f13f946a4e00c7df46768f1c85089eae515 \ + --hash=sha256:37ea3be171f3cf3e7b7e412a98b77685eba9d4fd67421f4a34686a63a65d99f9 \ + --hash=sha256:390eee4225a661c5cd133c09f5da1ee3c84498dc265fd292a6912b65c421c78c \ + --hash=sha256:3aed6544e4d523cd6b3119b0916cef3d15ef2da51e088211e4d1eb91a6c7f4f1 \ + --hash=sha256:3ceb56c4285754e33bb3c2fa777d055e96e6932351a3082ce3559be47f8024f0 \ + --hash=sha256:44a8502dd5de653ae6a73e2de50a401d84184f0331d0ac3daeb044e66d5c5054 \ + --hash=sha256:4b177f5547f1b995826ef529d2eef89cca2f830dd8b2c99ffd5fde4da734ba73 \ + --hash=sha256:4efac5481c696d5cb124ff1c119a78bddbfdd13fc499e3bc0ca81e95fc573684 \ + --hash=sha256:52fbf85aa71263380d330f4fce9f013c0798242e31ede05fcee7fbe40ccfc20d \ + --hash=sha256:55857c71641064f01ff0541a1776bfe04a59db5558e82897d35a7793e525774c \ + --hash=sha256:66a24f3d45c33550703f0abb8b656515b0ab777970fa275693a2f6dc8e35f1c1 \ + --hash=sha256:6ab2d912ca39c51f46baf2a0d92aa265aa96b2443266fc50d234fa88bf877d8e \ + --hash=sha256:77d65165fc35cff6e954e7fd4229e05ec76102d4406d4576528d3a3635fc6172 \ + --hash=sha256:7dfc914cc31c906297b30463dde0b9be48e36939575eaf2a0a22a8096e69afe5 \ + --hash=sha256:7f20ebec257af55694d8f993e162ddf0d36bd82d4e57f74b31c67b3c6d63d8b2 \ + --hash=sha256:80af6f1e69c5e68a2be529990684abdd31ed6622e988bf18850075c81bb1ad6e \ + --hash=sha256:83bbf5807dc3ee94ce1de2dfe8a356e1d74101e4b9d7aa8c720cc4818a34aded \ + --hash=sha256:8720c25cd9ac25dd04ee02b69256d0ce35bf8a0f29e20577427355272230965a \ + --hash=sha256:8829924fffb25386995a31998ccbbeaa7367223e647e0122043dfc485a87c666 \ + --hash=sha256:8a3869a6661ec8f81d93f4597da50336718bde9eb13267a699ac7e0a1d6d0bea \ + --hash=sha256:8cb620037a2fd9eeee97b4531880e439ebfcd6d7d78f2e7dcc3726428ab5ef63 \ + --hash=sha256:919d7f18f63bcad3a0f81146188e90274fde800a94e35d42ffe9eadf6a9a6330 \ + --hash=sha256:95c87ce2a97434dffe7327a4071839ab8e8bffd0054cc74cbe971fba98aedd60 \ + --hash=sha256:963cc8d7d79b12c56008aabd8b457f400952dbea8997dd185f155e2f228db079 \ + --hash=sha256:96f473cdacfdd506008a5d7579c9f6a7ff245a9ade92c3c0265eb76cc591914f \ + --hash=sha256:9d1fae6bbf0816415b81db1e82fb3bf56f7857273c84dcbe68cbe046e58e1ccd \ + --hash=sha256:a0c8ddabef9c8f41617f213e527254c41e8b96ea9d387c632af878d05db9229c \ + --hash=sha256:a1b988b40f2fd9de5c820f3a701a43339d8dcf2cb2f1ca137e2c02671cc83ac1 \ + --hash=sha256:a47faedc9ea2e7a3b6569795c040aae5895a19dde0c728a48d3c5d7995fda385 \ + --hash=sha256:a8040f85dcb9830d8bbb033ae66d272614cec6faceee88d37a88a9bd1a7a704e \ + --hash=sha256:b33bd114fa5a83f03ec6b7b262ef9f5cac549d4126f1dc702078767b10c46ed9 \ + --hash=sha256:c08079b4934b0bf0a8847f42c197b1d12cba6495a3d43febd7e99ecd1cdc8d54 \ + 
--hash=sha256:c28848761a6520c5c6071d2904a18d339a796ebe6b800adc8b3f474c5ce3c3ad \ + --hash=sha256:cb400138e73969eb5e0535d1d06cae6a6f7a15f2cc74add320e2130b8179211a \ + --hash=sha256:cbb5780e2e740b6b4f2d208e90453591036ff80c02cc605fea1af8e6fc6b1bbe \ + --hash=sha256:ccf2ebd2de2d6661e2520dae293298a3803a98ebfc099275f113ce1f6c2a80f1 \ + --hash=sha256:d35740e3f45f60f3c37b1e6f2f4702c23867b9ce21c6410254c9c682237da68d \ + --hash=sha256:d99abcd61760ebb34bdff37e5a3ba333c5cc09feda8c1ad42547bea0416ada78 \ + --hash=sha256:ddda1aa22495d8acd9dfbafff2866438d12faec4d024ebc2e656784d96328ad0 \ + --hash=sha256:dffd29a2961f3263a16d73945b57cd44a8fd0b235740cb14056f0612329b345e \ + --hash=sha256:e4842e4872ae4ae0f5497bf60a0498fa778c192cc7a9e87877abd2814aca9475 \ + --hash=sha256:e8dbe3e00771bfe3d04feed8210fc6617006d06d9a2679b74605b9fed3e8362c \ + --hash=sha256:ee2e743e51cb964b4975de572aa8fb95b633f496f9fcb5e257893df3be854746 \ + --hash=sha256:eeb38ff04ab6e5756a2aef6ad8d94e89bb4a51ef96e20f45c44ba190fa0bcaad \ + --hash=sha256:f8261fa2a5f679abeb2a0a93ad056d765cdca1c47745eda3f2d87f874ff4b8c9 + # via + # tensorboard + # tensorflow +h5py==3.12.1 \ + --hash=sha256:018a4597f35092ae3fb28ee851fdc756d2b88c96336b8480e124ce1ac6fb9166 \ + --hash=sha256:050a4f2c9126054515169c49cb900949814987f0c7ae74c341b0c9f9b5056834 \ + --hash=sha256:06a903a4e4e9e3ebbc8b548959c3c2552ca2d70dac14fcfa650d9261c66939ed \ + --hash=sha256:1473348139b885393125126258ae2d70753ef7e9cec8e7848434f385ae72069e \ + --hash=sha256:2f0f1a382cbf494679c07b4371f90c70391dedb027d517ac94fa2c05299dacda \ + --hash=sha256:326d70b53d31baa61f00b8aa5f95c2fcb9621a3ee8365d770c551a13dbbcbfdf \ + --hash=sha256:3b15d8dbd912c97541312c0e07438864d27dbca857c5ad634de68110c6beb1c2 \ + --hash=sha256:3fdf95092d60e8130ba6ae0ef7a9bd4ade8edbe3569c13ebbaf39baefffc5ba4 \ + --hash=sha256:4532c7e97fbef3d029735db8b6f5bf01222d9ece41e309b20d63cfaae2fb5c4d \ + --hash=sha256:513171e90ed92236fc2ca363ce7a2fc6f2827375efcbb0cc7fbdd7fe11fecafc \ + --hash=sha256:52ab036c6c97055b85b2a242cb540ff9590bacfda0c03dd0cf0661b311f522f8 \ + --hash=sha256:577d618d6b6dea3da07d13cc903ef9634cde5596b13e832476dd861aaf651f3e \ + --hash=sha256:59400f88343b79655a242068a9c900001a34b63e3afb040bd7cdf717e440f653 \ + --hash=sha256:59685fe40d8c1fbbee088c88cd4da415a2f8bee5c270337dc5a1c4aa634e3307 \ + --hash=sha256:5c4b41d1019322a5afc5082864dfd6359f8935ecd37c11ac0029be78c5d112c9 \ + --hash=sha256:62be1fc0ef195891949b2c627ec06bc8e837ff62d5b911b6e42e38e0f20a897d \ + --hash=sha256:6fdf6d7936fa824acfa27305fe2d9f39968e539d831c5bae0e0d83ed521ad1ac \ + --hash=sha256:7b3b8f3b48717e46c6a790e3128d39c61ab595ae0a7237f06dfad6a3b51d5351 \ + --hash=sha256:84342bffd1f82d4f036433e7039e241a243531a1d3acd7341b35ae58cdab05bf \ + --hash=sha256:ad8a76557880aed5234cfe7279805f4ab5ce16b17954606cca90d578d3e713ef \ + --hash=sha256:ba51c0c5e029bb5420a343586ff79d56e7455d496d18a30309616fdbeed1068f \ + --hash=sha256:cb65f619dfbdd15e662423e8d257780f9a66677eae5b4b3fc9dca70b5fd2d2a3 \ + --hash=sha256:ccd9006d92232727d23f784795191bfd02294a4f2ba68708825cb1da39511a93 \ + --hash=sha256:d2b8dd64f127d8b324f5d2cd1c0fd6f68af69084e9e47d27efeb9e28e685af3e \ + --hash=sha256:d3e465aee0ec353949f0f46bf6c6f9790a2006af896cee7c178a8c3e5090aa32 \ + --hash=sha256:e4d51919110a030913201422fb07987db4338eba5ec8c5a15d6fab8e03d443fc + # via tensorflow +idna==3.10 \ + --hash=sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9 \ + --hash=sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3 + # via requests +iniconfig==2.0.0 \ + 
--hash=sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3 \ + --hash=sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374 + # via pytest +keras==2.15.0 \ + --hash=sha256:2dcc6d2e30cf9c951064b63c1f4c404b966c59caf09e01f3549138ec8ee0dd1f \ + --hash=sha256:81871d298c064dc4ac6b58440fdae67bfcf47c8d7ad28580fab401834c06a575 + # via tensorflow +libclang==18.1.1 \ + --hash=sha256:0b2e143f0fac830156feb56f9231ff8338c20aecfe72b4ffe96f19e5a1dbb69a \ + --hash=sha256:3f0e1f49f04d3cd198985fea0511576b0aee16f9ff0e0f0cad7f9c57ec3c20e8 \ + --hash=sha256:4dd2d3b82fab35e2bf9ca717d7b63ac990a3519c7e312f19fa8e86dcc712f7fb \ + --hash=sha256:54dda940a4a0491a9d1532bf071ea3ef26e6dbaf03b5000ed94dd7174e8f9592 \ + --hash=sha256:69f8eb8f65c279e765ffd28aaa7e9e364c776c17618af8bff22a8df58677ff4f \ + --hash=sha256:6f14c3f194704e5d09769108f03185fce7acaf1d1ae4bbb2f30a72c2400cb7c5 \ + --hash=sha256:83ce5045d101b669ac38e6da8e58765f12da2d3aafb3b9b98d88b286a60964d8 \ + --hash=sha256:a1214966d08d73d971287fc3ead8dfaf82eb07fb197680d8b3859dbbbbf78250 \ + --hash=sha256:c533091d8a3bbf7460a00cb6c1a71da93bffe148f172c7d03b1c31fbf8aa2a0b \ + --hash=sha256:cf4a99b05376513717ab5d82a0db832c56ccea4fd61a69dbb7bccf2dfb207dbe + # via tensorflow +markdown==3.7 \ + --hash=sha256:2ae2471477cfd02dbbf038d5d9bc226d40def84b4fe2986e49b59b6b472bbed2 \ + --hash=sha256:7eb6df5690b81a1d7942992c97fad2938e956e79df20cbc6186e9c3a77b1c803 + # via tensorboard +markupsafe==3.0.2 \ + --hash=sha256:0bff5e0ae4ef2e1ae4fdf2dfd5b76c75e5c2fa4132d05fc1b0dabcd20c7e28c4 \ + --hash=sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30 \ + --hash=sha256:1225beacc926f536dc82e45f8a4d68502949dc67eea90eab715dea3a21c1b5f0 \ + --hash=sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9 \ + --hash=sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396 \ + --hash=sha256:1a9d3f5f0901fdec14d8d2f66ef7d035f2157240a433441719ac9a3fba440b13 \ + --hash=sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028 \ + --hash=sha256:1e084f686b92e5b83186b07e8a17fc09e38fff551f3602b249881fec658d3eca \ + --hash=sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557 \ + --hash=sha256:2cb8438c3cbb25e220c2ab33bb226559e7afb3baec11c4f218ffa7308603c832 \ + --hash=sha256:3169b1eefae027567d1ce6ee7cae382c57fe26e82775f460f0b2778beaad66c0 \ + --hash=sha256:3809ede931876f5b2ec92eef964286840ed3540dadf803dd570c3b7e13141a3b \ + --hash=sha256:38a9ef736c01fccdd6600705b09dc574584b89bea478200c5fbf112a6b0d5579 \ + --hash=sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a \ + --hash=sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c \ + --hash=sha256:48032821bbdf20f5799ff537c7ac3d1fba0ba032cfc06194faffa8cda8b560ff \ + --hash=sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c \ + --hash=sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22 \ + --hash=sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094 \ + --hash=sha256:57cb5a3cf367aeb1d316576250f65edec5bb3be939e9247ae594b4bcbc317dfb \ + --hash=sha256:5b02fb34468b6aaa40dfc198d813a641e3a63b98c2b05a16b9f80b7ec314185e \ + --hash=sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5 \ + --hash=sha256:6af100e168aa82a50e186c82875a5893c5597a0c1ccdb0d8b40240b1f28b969a \ + --hash=sha256:6c89876f41da747c8d3677a2b540fb32ef5715f97b66eeb0c6b66f5e3ef6f59d \ + --hash=sha256:6e296a513ca3d94054c2c881cc913116e90fd030ad1c656b3869762b754f5f8a \ + 
--hash=sha256:70a87b411535ccad5ef2f1df5136506a10775d267e197e4cf531ced10537bd6b \ + --hash=sha256:7e94c425039cde14257288fd61dcfb01963e658efbc0ff54f5306b06054700f8 \ + --hash=sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225 \ + --hash=sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c \ + --hash=sha256:88b49a3b9ff31e19998750c38e030fc7bb937398b1f78cfa599aaef92d693144 \ + --hash=sha256:8c4e8c3ce11e1f92f6536ff07154f9d49677ebaaafc32db9db4620bc11ed480f \ + --hash=sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87 \ + --hash=sha256:9025b4018f3a1314059769c7bf15441064b2207cb3f065e6ea1e7359cb46db9d \ + --hash=sha256:93335ca3812df2f366e80509ae119189886b0f3c2b81325d39efdb84a1e2ae93 \ + --hash=sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf \ + --hash=sha256:9e2d922824181480953426608b81967de705c3cef4d1af983af849d7bd619158 \ + --hash=sha256:a123e330ef0853c6e822384873bef7507557d8e4a082961e1defa947aa59ba84 \ + --hash=sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb \ + --hash=sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48 \ + --hash=sha256:b424c77b206d63d500bcb69fa55ed8d0e6a3774056bdc4839fc9298a7edca171 \ + --hash=sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c \ + --hash=sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6 \ + --hash=sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd \ + --hash=sha256:bbcb445fa71794da8f178f0f6d66789a28d7319071af7a496d4d507ed566270d \ + --hash=sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1 \ + --hash=sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d \ + --hash=sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca \ + --hash=sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a \ + --hash=sha256:cfad01eed2c2e0c01fd0ecd2ef42c492f7f93902e39a42fc9ee1692961443a29 \ + --hash=sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe \ + --hash=sha256:d8213e09c917a951de9d09ecee036d5c7d36cb6cb7dbaece4c71a60d79fb9798 \ + --hash=sha256:e07c3764494e3776c602c1e78e298937c3315ccc9043ead7e685b7f2b8d47b3c \ + --hash=sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8 \ + --hash=sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f \ + --hash=sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f \ + --hash=sha256:eaa0a10b7f72326f1372a713e73c3f739b524b3af41feb43e4921cb529f5929a \ + --hash=sha256:eb7972a85c54febfb25b5c4b4f3af4dcc731994c7da0d8a0b4a6eb0640e1d178 \ + --hash=sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0 \ + --hash=sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79 \ + --hash=sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430 \ + --hash=sha256:fcabf5ff6eea076f859677f5f0b6b5c1a51e70a376b0579e0eadef8db48c6b50 + # via werkzeug +ml-dtypes==0.3.2 \ + --hash=sha256:2c34f2ba9660b21fe1034b608308a01be82bbef2a92fb8199f24dc6bad0d5226 \ + --hash=sha256:3a17ef2322e60858d93584e9c52a5be7dd6236b056b7fa1ec57f1bb6ba043e33 \ + --hash=sha256:533059bc5f1764fac071ef54598db358c167c51a718f68f5bb55e3dee79d2967 \ + --hash=sha256:6604877d567a29bfe7cc02969ae0f2425260e5335505cf5e7fefc3e5465f5655 \ + --hash=sha256:6b35c4e8ca957c877ac35c79ffa77724ecc3702a1e4b18b08306c03feae597bb \ + --hash=sha256:763697ab8a88d47443997a7cdf3aac7340049aed45f7521f6b0ec8a0594821fe \ + 
--hash=sha256:7a4c3fcbf86fa52d0204f07cfd23947ef05b4ad743a1a988e163caa34a201e5e \ + --hash=sha256:7afde548890a92b41c0fed3a6c525f1200a5727205f73dc21181a2726571bb53 \ + --hash=sha256:7ba8e1fafc7fff3e643f453bffa7d082df1678a73286ce8187d3e825e776eb94 \ + --hash=sha256:91f8783fd1f2c23fd3b9ee5ad66b785dafa58ba3cdb050c4458021fa4d1eb226 \ + --hash=sha256:93b78f53431c93953f7850bb1b925a17f0ab5d97527e38a7e865b5b4bc5cfc18 \ + --hash=sha256:961134ea44c7b8ca63eda902a44b58cd8bd670e21d62e255c81fba0a8e70d9b7 \ + --hash=sha256:b89b194e9501a92d289c1ffd411380baf5daafb9818109a4f49b0a1b6dce4462 \ + --hash=sha256:c7b3fb3d4f6b39bcd4f6c4b98f406291f0d681a895490ee29a0f95bab850d53c \ + --hash=sha256:d1a746fe5fb9cd974a91070174258f0be129c592b93f9ce7df6cc336416c3fbd \ + --hash=sha256:e8505946df1665db01332d885c2020b4cb9e84a8b1241eb4ba69d59591f65855 \ + --hash=sha256:f47619d978ab1ae7dfdc4052ea97c636c6263e1f19bd1be0e42c346b98d15ff4 + # via tensorflow +numpy==1.26.4 \ + --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ + --hash=sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818 \ + --hash=sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20 \ + --hash=sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0 \ + --hash=sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010 \ + --hash=sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a \ + --hash=sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea \ + --hash=sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c \ + --hash=sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71 \ + --hash=sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110 \ + --hash=sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be \ + --hash=sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a \ + --hash=sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a \ + --hash=sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5 \ + --hash=sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed \ + --hash=sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd \ + --hash=sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c \ + --hash=sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e \ + --hash=sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0 \ + --hash=sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c \ + --hash=sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a \ + --hash=sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b \ + --hash=sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0 \ + --hash=sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6 \ + --hash=sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2 \ + --hash=sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a \ + --hash=sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30 \ + --hash=sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218 \ + --hash=sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5 \ + --hash=sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07 \ + --hash=sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2 \ + 
--hash=sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4 \ + --hash=sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764 \ + --hash=sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef \ + --hash=sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3 \ + --hash=sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f + # via + # fastparquet + # h5py + # ml-dtypes + # pandas + # scipy + # tensorboard + # tensorflow +oauthlib==3.2.2 \ + --hash=sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca \ + --hash=sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918 + # via requests-oauthlib +opt-einsum==3.4.0 \ + --hash=sha256:69bb92469f86a1565195ece4ac0323943e83477171b91d24c35afe028a90d7cd \ + --hash=sha256:96ca72f1b886d148241348783498194c577fa30a8faac108586b14f1ba4473ac + # via tensorflow +packaging==24.1 \ + --hash=sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002 \ + --hash=sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124 + # via + # fastparquet + # pytest + # tensorflow +pandas==2.2.3 \ + --hash=sha256:062309c1b9ea12a50e8ce661145c6aab431b1e99530d3cd60640e255778bd43a \ + --hash=sha256:15c0e1e02e93116177d29ff83e8b1619c93ddc9c49083f237d4312337a61165d \ + --hash=sha256:1948ddde24197a0f7add2bdc4ca83bf2b1ef84a1bc8ccffd95eda17fd836ecb5 \ + --hash=sha256:1db71525a1538b30142094edb9adc10be3f3e176748cd7acc2240c2f2e5aa3a4 \ + --hash=sha256:22a9d949bfc9a502d320aa04e5d02feab689d61da4e7764b62c30b991c42c5f0 \ + --hash=sha256:29401dbfa9ad77319367d36940cd8a0b3a11aba16063e39632d98b0e931ddf32 \ + --hash=sha256:31d0ced62d4ea3e231a9f228366919a5ea0b07440d9d4dac345376fd8e1477ea \ + --hash=sha256:3508d914817e153ad359d7e069d752cdd736a247c322d932eb89e6bc84217f28 \ + --hash=sha256:37e0aced3e8f539eccf2e099f65cdb9c8aa85109b0be6e93e2baff94264bdc6f \ + --hash=sha256:381175499d3802cde0eabbaf6324cce0c4f5d52ca6f8c377c29ad442f50f6348 \ + --hash=sha256:38cf8125c40dae9d5acc10fa66af8ea6fdf760b2714ee482ca691fc66e6fcb18 \ + --hash=sha256:3b71f27954685ee685317063bf13c7709a7ba74fc996b84fc6821c59b0f06468 \ + --hash=sha256:3fc6873a41186404dad67245896a6e440baacc92f5b716ccd1bc9ed2995ab2c5 \ + --hash=sha256:4850ba03528b6dd51d6c5d273c46f183f39a9baf3f0143e566b89450965b105e \ + --hash=sha256:4f18ba62b61d7e192368b84517265a99b4d7ee8912f8708660fb4a366cc82667 \ + --hash=sha256:56534ce0746a58afaf7942ba4863e0ef81c9c50d3f0ae93e9497d6a41a057645 \ + --hash=sha256:59ef3764d0fe818125a5097d2ae867ca3fa64df032331b7e0917cf5d7bf66b13 \ + --hash=sha256:5dbca4c1acd72e8eeef4753eeca07de9b1db4f398669d5994086f788a5d7cc30 \ + --hash=sha256:5de54125a92bb4d1c051c0659e6fcb75256bf799a732a87184e5ea503965bce3 \ + --hash=sha256:61c5ad4043f791b61dd4752191d9f07f0ae412515d59ba8f005832a532f8736d \ + --hash=sha256:6374c452ff3ec675a8f46fd9ab25c4ad0ba590b71cf0656f8b6daa5202bca3fb \ + --hash=sha256:63cc132e40a2e084cf01adf0775b15ac515ba905d7dcca47e9a251819c575ef3 \ + --hash=sha256:66108071e1b935240e74525006034333f98bcdb87ea116de573a6a0dccb6c039 \ + --hash=sha256:6dfcb5ee8d4d50c06a51c2fffa6cff6272098ad6540aed1a76d15fb9318194d8 \ + --hash=sha256:7c2875855b0ff77b2a64a0365e24455d9990730d6431b9e0ee18ad8acee13dbd \ + --hash=sha256:7eee9e7cea6adf3e3d24e304ac6b8300646e2a5d1cd3a3c2abed9101b0846761 \ + --hash=sha256:800250ecdadb6d9c78eae4990da62743b857b470883fa27f652db8bdde7f6659 \ + --hash=sha256:86976a1c5b25ae3f8ccae3a5306e443569ee3c3faf444dfd0f41cda24667ad57 \ + 
--hash=sha256:8cd6d7cc958a3910f934ea8dbdf17b2364827bb4dafc38ce6eef6bb3d65ff09c \ + --hash=sha256:99df71520d25fade9db7c1076ac94eb994f4d2673ef2aa2e86ee039b6746d20c \ + --hash=sha256:a5a1595fe639f5988ba6a8e5bc9649af3baf26df3998a0abe56c02609392e0a4 \ + --hash=sha256:ad5b65698ab28ed8d7f18790a0dc58005c7629f227be9ecc1072aa74c0c1d43a \ + --hash=sha256:b1d432e8d08679a40e2a6d8b2f9770a5c21793a6f9f47fdd52c5ce1948a5a8a9 \ + --hash=sha256:b8661b0238a69d7aafe156b7fa86c44b881387509653fdf857bebc5e4008ad42 \ + --hash=sha256:ba96630bc17c875161df3818780af30e43be9b166ce51c9a18c1feae342906c2 \ + --hash=sha256:bc6b93f9b966093cb0fd62ff1a7e4c09e6d546ad7c1de191767baffc57628f39 \ + --hash=sha256:c124333816c3a9b03fbeef3a9f230ba9a737e9e5bb4060aa2107a86cc0a497fc \ + --hash=sha256:cd8d0c3be0515c12fed0bdbae072551c8b54b7192c7b1fda0ba56059a0179698 \ + --hash=sha256:d9c45366def9a3dd85a6454c0e7908f2b3b8e9c138f5dc38fed7ce720d8453ed \ + --hash=sha256:f00d1345d84d8c86a63e476bb4955e46458b304b9575dcf71102b5c705320015 \ + --hash=sha256:f3a255b2c19987fbbe62a9dfd6cff7ff2aa9ccab3fc75218fd4b7530f01efa24 \ + --hash=sha256:fffb8ae78d8af97f849404f21411c95062db1496aeb3e56f146f0355c9989319 + # via + # -r build_deps/requirements.in + # fastparquet +pluggy==1.5.0 \ + --hash=sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1 \ + --hash=sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669 + # via pytest +portpicker==1.6.0 \ + --hash=sha256:b2787a41404cf7edbe29b07b9e0ed863b09f2665dcc01c1eb0c2261c1e7d0755 \ + --hash=sha256:bd507fd6f96f65ee02781f2e674e9dc6c99bbfa6e3c39992e3916204c9d431fa + # via -r build_deps/requirements.in +protobuf==4.25.5 \ + --hash=sha256:0aebecb809cae990f8129ada5ca273d9d670b76d9bfc9b1809f0a9c02b7dbf41 \ + --hash=sha256:4be0571adcbe712b282a330c6e89eae24281344429ae95c6d85e79e84780f5ea \ + --hash=sha256:5e61fd921603f58d2f5acb2806a929b4675f8874ff5f330b7d6f7e2e784bbcd8 \ + --hash=sha256:7a183f592dc80aa7c8da7ad9e55091c4ffc9497b3054452d629bb85fa27c2a45 \ + --hash=sha256:7f8249476b4a9473645db7f8ab42b02fe1488cbe5fb72fddd445e0665afd8584 \ + --hash=sha256:919ad92d9b0310070f8356c24b855c98df2b8bd207ebc1c0c6fcc9ab1e007f3d \ + --hash=sha256:98d8d8aa50de6a2747efd9cceba361c9034050ecce3e09136f90de37ddba66e1 \ + --hash=sha256:abe32aad8561aa7cc94fc7ba4fdef646e576983edb94a73381b03c53728a626f \ + --hash=sha256:b0234dd5a03049e4ddd94b93400b67803c823cfc405689688f59b34e0742381a \ + --hash=sha256:b2fde3d805354df675ea4c7c6338c1aecd254dfc9925e88c6d31a2bcb97eb173 \ + --hash=sha256:fe14e16c22be926d3abfcb500e60cab068baf10b542b8c858fa27e098123e331 + # via + # -r build_deps/requirements.in + # tensorboard + # tensorflow +psutil==6.1.1 \ + --hash=sha256:018aeae2af92d943fdf1da6b58665124897cfc94faa2ca92098838f83e1b1bca \ + --hash=sha256:0bdd4eab935276290ad3cb718e9809412895ca6b5b334f5a9111ee6d9aff9377 \ + --hash=sha256:1924e659d6c19c647e763e78670a05dbb7feaf44a0e9c94bf9e14dfc6ba50468 \ + --hash=sha256:33431e84fee02bc84ea36d9e2c4a6d395d479c9dd9bba2376c1f6ee8f3a4e0b3 \ + --hash=sha256:384636b1a64b47814437d1173be1427a7c83681b17a450bfc309a1953e329603 \ + --hash=sha256:6d4281f5bbca041e2292be3380ec56a9413b790579b8e593b1784499d0005dac \ + --hash=sha256:8be07491f6ebe1a693f17d4f11e69d0dc1811fa082736500f649f79df7735303 \ + --hash=sha256:8df0178ba8a9e5bc84fed9cfa61d54601b371fbec5c8eebad27575f1e105c0d4 \ + --hash=sha256:97f7cb9921fbec4904f522d972f0c0e1f4fabbdd4e0287813b21215074a0f160 \ + --hash=sha256:9ccc4316f24409159897799b83004cb1e24f9819b0dcf9c0b68bdcb6cefee6a8 \ + 
--hash=sha256:b6e06c20c05fe95a3d7302d74e7097756d4ba1247975ad6905441ae1b5b66003 \ + --hash=sha256:c777eb75bb33c47377c9af68f30e9f11bc78e0f07fbf907be4a5d70b2fe5f030 \ + --hash=sha256:ca9609c77ea3b8481ab005da74ed894035936223422dc591d6772b147421f777 \ + --hash=sha256:cf8496728c18f2d0b45198f06895be52f36611711746b7f30c464b422b50e2f5 \ + --hash=sha256:eaa912e0b11848c4d9279a93d7e2783df352b082f40111e078388701fd479e53 \ + --hash=sha256:f35cfccb065fff93529d2afb4a2e89e363fe63ca1e4a5da22b603a85833c2649 \ + --hash=sha256:fc0ed7fe2231a444fc219b9c42d0376e0a9a1a72f16c5cfa0f68d19f1a0663e8 + # via portpicker +py==1.11.0 \ + --hash=sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719 \ + --hash=sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378 + # via pytest +pyasn1==0.6.1 \ + --hash=sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629 \ + --hash=sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034 + # via + # pyasn1-modules + # rsa +pyasn1-modules==0.4.2 \ + --hash=sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a \ + --hash=sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6 + # via google-auth +pytest==6.2.5 \ + --hash=sha256:131b36680866a76e6781d13f101efb86cf674ebb9762eb70d3082b6f29889e89 \ + --hash=sha256:7310f8d27bc79ced999e760ca304d69f6ba6c6649c0b60fb0e04a4a77cacc134 + # via + # -r build_deps/requirements.in + # pytest-xdist +pytest-xdist==3.5.0 \ + --hash=sha256:cbb36f3d67e0c478baa57fa4edc8843887e0f6cfc42d677530a36d7472b32d8a \ + --hash=sha256:d075629c7e00b611df89f490a5063944bee7a4362a5ff11c7cc7824a03dfce24 + # via -r build_deps/requirements.in +python-dateutil==2.9.0.post0 \ + --hash=sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3 \ + --hash=sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427 + # via pandas +pytz==2025.1 \ + --hash=sha256:89dd22dca55b46eac6eda23b2d72721bf1bdfef212645d81513ef5d03038de57 \ + --hash=sha256:c2db42be2a2518b28e65f9207c4d05e6ff547d1efa4086469ef855e4ab70178e + # via pandas +requests==2.32.3 \ + --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ + --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 + # via + # requests-oauthlib + # tensorboard +requests-oauthlib==2.0.0 \ + --hash=sha256:7dd8a5c40426b779b0868c404bdef9768deccf22749cde15852df527e6269b36 \ + --hash=sha256:b3dffaebd884d8cd778494369603a9e7b58d29111bf6b41bdc2dcd87203af4e9 + # via google-auth-oauthlib +rsa==4.9.1 \ + --hash=sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762 \ + --hash=sha256:e7bdbfdb5497da4c07dfd35530e1a902659db6ff241e39d9953cad06ebd0ae75 + # via google-auth +scipy==1.14.1 \ + --hash=sha256:0c2f95de3b04e26f5f3ad5bb05e74ba7f68b837133a4492414b3afd79dfe540e \ + --hash=sha256:1729560c906963fc8389f6aac023739ff3983e727b1a4d87696b7bf108316a79 \ + --hash=sha256:278266012eb69f4a720827bdd2dc54b2271c97d84255b2faaa8f161a158c3b37 \ + --hash=sha256:2843f2d527d9eebec9a43e6b406fb7266f3af25a751aa91d62ff416f54170bc5 \ + --hash=sha256:2da0469a4ef0ecd3693761acbdc20f2fdeafb69e6819cc081308cc978153c675 \ + --hash=sha256:2ff0a7e01e422c15739ecd64432743cf7aae2b03f3084288f399affcefe5222d \ + --hash=sha256:2ff38e22128e6c03ff73b6bb0f85f897d2362f8c052e3b8ad00532198fbdae3f \ + --hash=sha256:30ac8812c1d2aab7131a79ba62933a2a76f582d5dbbc695192453dae67ad6310 \ + --hash=sha256:3a1b111fac6baec1c1d92f27e76511c9e7218f1695d61b59e05e0fe04dc59617 \ + 
--hash=sha256:4079b90df244709e675cdc8b93bfd8a395d59af40b72e339c2287c91860deb8e \ + --hash=sha256:5149e3fd2d686e42144a093b206aef01932a0059c2a33ddfa67f5f035bdfe13e \ + --hash=sha256:5a275584e726026a5699459aa72f828a610821006228e841b94275c4a7c08417 \ + --hash=sha256:631f07b3734d34aced009aaf6fedfd0eb3498a97e581c3b1e5f14a04164a456d \ + --hash=sha256:716e389b694c4bb564b4fc0c51bc84d381735e0d39d3f26ec1af2556ec6aad94 \ + --hash=sha256:8426251ad1e4ad903a4514712d2fa8fdd5382c978010d1c6f5f37ef286a713ad \ + --hash=sha256:8475230e55549ab3f207bff11ebfc91c805dc3463ef62eda3ccf593254524ce8 \ + --hash=sha256:8bddf15838ba768bb5f5083c1ea012d64c9a444e16192762bd858f1e126196d0 \ + --hash=sha256:8e32dced201274bf96899e6491d9ba3e9a5f6b336708656466ad0522d8528f69 \ + --hash=sha256:8f9ea80f2e65bdaa0b7627fb00cbeb2daf163caa015e59b7516395fe3bd1e066 \ + --hash=sha256:97c5dddd5932bd2a1a31c927ba5e1463a53b87ca96b5c9bdf5dfd6096e27efc3 \ + --hash=sha256:a49f6ed96f83966f576b33a44257d869756df6cf1ef4934f59dd58b25e0327e5 \ + --hash=sha256:af29a935803cc707ab2ed7791c44288a682f9c8107bc00f0eccc4f92c08d6e07 \ + --hash=sha256:b05d43735bb2f07d689f56f7b474788a13ed8adc484a85aa65c0fd931cf9ccd2 \ + --hash=sha256:b28d2ca4add7ac16ae8bb6632a3c86e4b9e4d52d3e34267f6e1b0c1f8d87e389 \ + --hash=sha256:b99722ea48b7ea25e8e015e8341ae74624f72e5f21fc2abd45f3a93266de4c5d \ + --hash=sha256:baff393942b550823bfce952bb62270ee17504d02a1801d7fd0719534dfb9c84 \ + --hash=sha256:c0ee987efa6737242745f347835da2cc5bb9f1b42996a4d97d5c7ff7928cb6f2 \ + --hash=sha256:d0d2821003174de06b69e58cef2316a6622b60ee613121199cb2852a873f8cf3 \ + --hash=sha256:e0cf28db0f24a38b2a0ca33a85a54852586e43cf6fd876365c86e0657cfe7d73 \ + --hash=sha256:e4f5a7c49323533f9103d4dacf4e4f07078f360743dec7f7596949149efeec06 \ + --hash=sha256:eb58ca0abd96911932f688528977858681a59d61a7ce908ffd355957f7025cfc \ + --hash=sha256:edaf02b82cd7639db00dbff629995ef185c8df4c3ffa71a5562a595765a06ce1 \ + --hash=sha256:fef8c87f8abfb884dac04e97824b61299880c43f4ce675dd2cbeadd3c9b466d2 + # via -r build_deps/requirements.in +six==1.16.0 \ + --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ + --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 + # via + # astunparse + # google-pasta + # python-dateutil + # tensorboard + # tensorflow +tblib==1.7.0 \ + --hash=sha256:059bd77306ea7b419d4f76016aef6d7027cc8a0785579b5aad198803435f882c \ + --hash=sha256:289fa7359e580950e7d9743eab36b0691f0310fce64dee7d9c31065b8f723e23 + # via -r build_deps/requirements.in +tensorboard==2.15.2 \ + --hash=sha256:a6f6443728064d962caea6d34653e220e34ef8df764cb06a8212c17e1a8f0622 + # via tensorflow +tensorboard-data-server==0.7.2 \ + --hash=sha256:7e0610d205889588983836ec05dc098e80f97b7e7bbff7e994ebb78f578d0ddb \ + --hash=sha256:9fe5d24221b29625dbc7328b0436ca7fc1c23de4acf4d272f1180856e32f9f60 \ + --hash=sha256:ef687163c24185ae9754ed5650eb5bc4d84ff257aabdc33f0cc6f74d8ba54530 + # via tensorboard +tensorflow==2.15.1 \ + --hash=sha256:10132acc072d59696c71ce7221d2d8e0e3ff1e6bc8688dbac6d7aed8e675b710 \ + --hash=sha256:30c5ef9c758ec9ff7ce2aff76b71c980bc5119b879071c2cc623b1591a497a1a \ + --hash=sha256:432788ac5d1234b9e9b7c7f73603a5655271a28c293329c52c7c0b9434a1184e \ + --hash=sha256:6761efe511e6ee0f893f60738fefbcc51d6dc386eeaaafea59d21899ef369ffd \ + --hash=sha256:89b5aa1022dec47e567512eaf4e1271b8e6c1ff1984e30d0d9127bd1093ed4c5 \ + --hash=sha256:8e5431d45ceb416c2b1b6de87378054fbac7d2ed35d45b102d89a786613fffdc \ + --hash=sha256:91b51a507007d63a70b65be307d701088d15042a6399c0e2312b53072226e909 \ + 
--hash=sha256:a49f8755c74a89553294a99ab25aa87ab1cddbfa40fe58387e09f64f0578cedc \ + --hash=sha256:aa926114d1e13ffe5b2ea59c3f195216f26646d7fe36e9e5207b291e4b7902ff \ + --hash=sha256:aaf3cfa290597ebbdf19d1a78729e3f555e459506cd58f8d7399359ac5e02a05 \ + --hash=sha256:b75815b6a601edad52b4181e9805c8fcd04813a6ab1d5cd8127188dfd2788e20 \ + --hash=sha256:bb0edd69103c154245c5f209f0507355cc68ba7e4de350084bc31edc562478e4 \ + --hash=sha256:e73d43dbc68d8c711e70edecc4ac70472799a25ec4ec18a84d479ee18033d3c5 \ + --hash=sha256:ea290e435464cf0794f657b48786e5fa413362abe55ed771c172c25980d070ce \ + --hash=sha256:f8e85821317c9c0fbf1256e9f721cfb1400ba1e09becb844b3ddd91f744805fc + # via + # -r build_deps/requirements.in + # tf-keras +tensorflow-estimator==2.15.0 \ + --hash=sha256:aedf21eec7fb2dc91150fc91a1ce12bc44dbb72278a08b58e79ff87c9e28f153 + # via tensorflow +tensorflow-io-gcs-filesystem==0.37.1 \ + --hash=sha256:0df00891669390078a003cedbdd3b8e645c718b111917535fa1d7725e95cdb95 \ + --hash=sha256:249c12b830165841411ba71e08215d0e94277a49c551e6dd5d72aab54fe5491b \ + --hash=sha256:257aab23470a0796978efc9c2bcf8b0bc80f22e6298612a4c0a50d3f4e88060c \ + --hash=sha256:286389a203a5aee1a4fa2e53718c661091aa5fea797ff4fa6715ab8436b02e6c \ + --hash=sha256:32c50ab4e29a23c1f91cd0f9ab8c381a0ab10f45ef5c5252e94965916041737c \ + --hash=sha256:426de1173cb81fbd62becec2012fc00322a295326d90eb6c737fab636f182aed \ + --hash=sha256:6e1f2796b57e799a8ca1b75bf47c2aaa437c968408cc1a402a9862929e104cda \ + --hash=sha256:8943036bbf84e7a2be3705cb56f9c9df7c48c9e614bb941f0936c58e3ca89d6f \ + --hash=sha256:8febbfcc67c61e542a5ac1a98c7c20a91a5e1afc2e14b1ef0cb7c28bc3b6aa70 \ + --hash=sha256:9679b36e3a80921876f31685ab6f7270f3411a4cc51bc2847e80d0e4b5291e27 \ + --hash=sha256:b02f9c5f94fd62773954a04f69b68c4d576d076fd0db4ca25d5479f0fbfcdbad \ + --hash=sha256:ee5da49019670ed364f3e5fb86b46420841a6c3cb52a300553c63841671b3e6d \ + --hash=sha256:ee7c8ee5fe2fd8cb6392669ef16e71841133041fee8a330eff519ad9b36e4556 \ + --hash=sha256:fbb33f1745f218464a59cecd9a18e32ca927b0f4d77abd8f8671b645cc1a182f \ + --hash=sha256:fe8dcc6d222258a080ac3dfcaaaa347325ce36a7a046277f6b3e19abc1efb3c5 \ + --hash=sha256:ffebb6666a7bfc28005f4fbbb111a455b5e7d6cd3b12752b7050863ecb27d5cc + # via tensorflow +termcolor==2.5.0 \ + --hash=sha256:37b17b5fc1e604945c2642c872a3764b5d547a48009871aea3edd3afa180afb8 \ + --hash=sha256:998d8d27da6d48442e8e1f016119076b690d962507531df4890fcd2db2ef8a6f + # via tensorflow +tf-keras==2.15.1 \ + --hash=sha256:40ab605cecc7759c657cb2bccd9efaacd6fc2369a6c1eba8053890afeac46886 \ + --hash=sha256:8beaef46b8b4f1158de1410e7c0cf82f008b9e8c4ab3443f54ac1aaef9c2ad74 + # via -r build_deps/requirements.in +toml==0.10.2 \ + --hash=sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b \ + --hash=sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f + # via pytest +tqdm==4.67.1 \ + --hash=sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2 \ + --hash=sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2 + # via -r build_deps/requirements.in +typeguard==2.13.3 \ + --hash=sha256:00edaa8da3a133674796cf5ea87d9f4b4c367d77476e185e80251cc13dfbb8c4 \ + --hash=sha256:5e3e3be01e887e7eafae5af63d1f36c849aaa94e3a0112097312aabfa16284f1 + # via -r build_deps/requirements.in +typing-extensions==4.12.2 \ + --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d \ + --hash=sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8 + # via tensorflow +tzdata==2025.1 \ + 
--hash=sha256:24894909e88cdb28bd1636c6887801df64cb485bd593f2fd83ef29075a81d694 \ + --hash=sha256:7e127113816800496f027041c570f50bcd464a020098a3b6b199517772303639 + # via pandas +urllib3==2.2.3 \ + --hash=sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac \ + --hash=sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9 + # via requests +werkzeug==3.1.3 \ + --hash=sha256:54b78bf3716d19a65be4fceccc0d1d7b89e608834989dfae50ea87564639213e \ + --hash=sha256:60723ce945c19328679790e3282cc758aa4a6040e4bb330f53d30fa546d44746 + # via tensorboard +wheel==0.45.1 \ + --hash=sha256:661e1abd9198507b1409a20c02106d9670b2576e916d58f520316666abca6729 \ + --hash=sha256:708e7481cc80179af0e556bbf0cc00b8444c7321e2700b8d8580231d13017248 + # via astunparse +wrapt==1.14.1 \ + --hash=sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3 \ + --hash=sha256:01c205616a89d09827986bc4e859bcabd64f5a0662a7fe95e0d359424e0e071b \ + --hash=sha256:02b41b633c6261feff8ddd8d11c711df6842aba629fdd3da10249a53211a72c4 \ + --hash=sha256:07f7a7d0f388028b2df1d916e94bbb40624c59b48ecc6cbc232546706fac74c2 \ + --hash=sha256:11871514607b15cfeb87c547a49bca19fde402f32e2b1c24a632506c0a756656 \ + --hash=sha256:1b376b3f4896e7930f1f772ac4b064ac12598d1c38d04907e696cc4d794b43d3 \ + --hash=sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9 \ + --hash=sha256:21ac0156c4b089b330b7666db40feee30a5d52634cc4560e1905d6529a3897ff \ + --hash=sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9 \ + --hash=sha256:257fd78c513e0fb5cdbe058c27a0624c9884e735bbd131935fd49e9fe719d310 \ + --hash=sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224 \ + --hash=sha256:2b39d38039a1fdad98c87279b48bc5dce2c0ca0d73483b12cb72aa9609278e8a \ + --hash=sha256:2cf71233a0ed05ccdabe209c606fe0bac7379fdcf687f39b944420d2a09fdb57 \ + --hash=sha256:2fe803deacd09a233e4762a1adcea5db5d31e6be577a43352936179d14d90069 \ + --hash=sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335 \ + --hash=sha256:3232822c7d98d23895ccc443bbdf57c7412c5a65996c30442ebe6ed3df335383 \ + --hash=sha256:34aa51c45f28ba7f12accd624225e2b1e5a3a45206aa191f6f9aac931d9d56fe \ + --hash=sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204 \ + --hash=sha256:36f582d0c6bc99d5f39cd3ac2a9062e57f3cf606ade29a0a0d6b323462f4dd87 \ + --hash=sha256:380a85cf89e0e69b7cfbe2ea9f765f004ff419f34194018a6827ac0e3edfed4d \ + --hash=sha256:40e7bc81c9e2b2734ea4bc1aceb8a8f0ceaac7c5299bc5d69e37c44d9081d43b \ + --hash=sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907 \ + --hash=sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be \ + --hash=sha256:4fcc4649dc762cddacd193e6b55bc02edca674067f5f98166d7713b193932b7f \ + --hash=sha256:5a0f54ce2c092aaf439813735584b9537cad479575a09892b8352fea5e988dc0 \ + --hash=sha256:5a9a0d155deafd9448baff28c08e150d9b24ff010e899311ddd63c45c2445e28 \ + --hash=sha256:5b02d65b9ccf0ef6c34cba6cf5bf2aab1bb2f49c6090bafeecc9cd81ad4ea1c1 \ + --hash=sha256:60db23fa423575eeb65ea430cee741acb7c26a1365d103f7b0f6ec412b893853 \ + --hash=sha256:642c2e7a804fcf18c222e1060df25fc210b9c58db7c91416fb055897fc27e8cc \ + --hash=sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf \ + --hash=sha256:6a9a25751acb379b466ff6be78a315e2b439d4c94c1e99cb7266d40a537995d3 \ + --hash=sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3 \ + --hash=sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164 \ + 
--hash=sha256:6e743de5e9c3d1b7185870f480587b75b1cb604832e380d64f9504a0535912d1 \ + --hash=sha256:709fe01086a55cf79d20f741f39325018f4df051ef39fe921b1ebe780a66184c \ + --hash=sha256:7b7c050ae976e286906dd3f26009e117eb000fb2cf3533398c5ad9ccc86867b1 \ + --hash=sha256:7d2872609603cb35ca513d7404a94d6d608fc13211563571117046c9d2bcc3d7 \ + --hash=sha256:7ef58fb89674095bfc57c4069e95d7a31cfdc0939e2a579882ac7d55aadfd2a1 \ + --hash=sha256:80bb5c256f1415f747011dc3604b59bc1f91c6e7150bd7db03b19170ee06b320 \ + --hash=sha256:81b19725065dcb43df02b37e03278c011a09e49757287dca60c5aecdd5a0b8ed \ + --hash=sha256:833b58d5d0b7e5b9832869f039203389ac7cbf01765639c7309fd50ef619e0b1 \ + --hash=sha256:88bd7b6bd70a5b6803c1abf6bca012f7ed963e58c68d76ee20b9d751c74a3248 \ + --hash=sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c \ + --hash=sha256:8c0ce1e99116d5ab21355d8ebe53d9460366704ea38ae4d9f6933188f327b456 \ + --hash=sha256:8d649d616e5c6a678b26d15ece345354f7c2286acd6db868e65fcc5ff7c24a77 \ + --hash=sha256:903500616422a40a98a5a3c4ff4ed9d0066f3b4c951fa286018ecdf0750194ef \ + --hash=sha256:9736af4641846491aedb3c3f56b9bc5568d92b0692303b5a305301a95dfd38b1 \ + --hash=sha256:988635d122aaf2bdcef9e795435662bcd65b02f4f4c1ae37fbee7401c440b3a7 \ + --hash=sha256:9cca3c2cdadb362116235fdbd411735de4328c61425b0aa9f872fd76d02c4e86 \ + --hash=sha256:9e0fd32e0148dd5dea6af5fee42beb949098564cc23211a88d799e434255a1f4 \ + --hash=sha256:9f3e6f9e05148ff90002b884fbc2a86bd303ae847e472f44ecc06c2cd2fcdb2d \ + --hash=sha256:a85d2b46be66a71bedde836d9e41859879cc54a2a04fad1191eb50c2066f6e9d \ + --hash=sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8 \ + --hash=sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8 \ + --hash=sha256:aa31fdcc33fef9eb2552cbcbfee7773d5a6792c137b359e82879c101e98584c5 \ + --hash=sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a \ + --hash=sha256:b014c23646a467558be7da3d6b9fa409b2c567d2110599b7cf9a0c5992b3b471 \ + --hash=sha256:b21bb4c09ffabfa0e85e3a6b623e19b80e7acd709b9f91452b8297ace2a8ab00 \ + --hash=sha256:b5901a312f4d14c59918c221323068fad0540e34324925c8475263841dbdfe68 \ + --hash=sha256:b9b7a708dd92306328117d8c4b62e2194d00c365f18eff11a9b53c6f923b01e3 \ + --hash=sha256:d1967f46ea8f2db647c786e78d8cc7e4313dbd1b0aca360592d8027b8508e24d \ + --hash=sha256:d52a25136894c63de15a35bc0bdc5adb4b0e173b9c0d07a2be9d3ca64a332735 \ + --hash=sha256:d77c85fedff92cf788face9bfa3ebaa364448ebb1d765302e9af11bf449ca36d \ + --hash=sha256:d79d7d5dc8a32b7093e81e97dad755127ff77bcc899e845f41bf71747af0c569 \ + --hash=sha256:dbcda74c67263139358f4d188ae5faae95c30929281bc6866d00573783c422b7 \ + --hash=sha256:ddaea91abf8b0d13443f6dac52e89051a5063c7d014710dcb4d4abb2ff811a59 \ + --hash=sha256:dee0ce50c6a2dd9056c20db781e9c1cfd33e77d2d569f5d1d9321c641bb903d5 \ + --hash=sha256:dee60e1de1898bde3b238f18340eec6148986da0455d8ba7848d50470a7a32fb \ + --hash=sha256:e2f83e18fe2f4c9e7db597e988f72712c0c3676d337d8b101f6758107c42425b \ + --hash=sha256:e3fb1677c720409d5f671e39bac6c9e0e422584e5f518bfd50aa4cbbea02433f \ + --hash=sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55 \ + --hash=sha256:ee2b1b1769f6707a8a445162ea16dddf74285c3964f605877a20e38545c3c462 \ + --hash=sha256:ee6acae74a2b91865910eef5e7de37dc6895ad96fa23603d1d27ea69df545015 \ + --hash=sha256:ef3f72c9666bba2bab70d2a8b79f2c6d2c1a42a7f7e2b0ec83bb2f9e383950af + # via tensorflow + +# The following packages are considered to be unsafe in a requirements file: +setuptools==75.6.0 \ + 
--hash=sha256:8199222558df7c86216af4f84c30e9b34a61d8ba19366cc914424cdbd28252f6 \ + --hash=sha256:ce74b49e8f7110f9bf04883b730f4765b774ef3ef28f722cce7c273d253aaf7d + # via + # tensorboard + # tensorflow diff --git a/build_deps/requirements_lock_3_11.txt b/build_deps/requirements_lock_3_11.txt new file mode 100644 index 00000000..a68dbad4 --- /dev/null +++ b/build_deps/requirements_lock_3_11.txt @@ -0,0 +1,733 @@ +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# bazel run //build:requirements.update +# +--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html +--find-links https://storage.googleapis.com/jax-releases/libtpu_releases.html + +absl-py==2.1.0 \ + --hash=sha256:526a04eadab8b4ee719ce68f204172ead1027549089702d99b9059f129ff1308 \ + --hash=sha256:7820790efbb316739cde8b4e19357243fc3608a152024288513dd968d7d959ff + # via + # -r build/requirements.in + # chex + # clu + # google-benchmark + # ml-collections + # optax + # orbax-checkpoint +certifi==2024.8.30 \ + --hash=sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8 \ + --hash=sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9 + # via requests +charset-normalizer==3.4.0 \ + --hash=sha256:0099d79bdfcf5c1f0c2c72f91516702ebf8b0b8ddd8905f97a8aecf49712c621 \ + --hash=sha256:0713f3adb9d03d49d365b70b84775d0a0d18e4ab08d12bc46baa6132ba78aaf6 \ + --hash=sha256:07afec21bbbbf8a5cc3651aa96b980afe2526e7f048fdfb7f1014d84acc8b6d8 \ + --hash=sha256:0b309d1747110feb25d7ed6b01afdec269c647d382c857ef4663bbe6ad95a912 \ + --hash=sha256:0d99dd8ff461990f12d6e42c7347fd9ab2532fb70e9621ba520f9e8637161d7c \ + --hash=sha256:0de7b687289d3c1b3e8660d0741874abe7888100efe14bd0f9fd7141bcbda92b \ + --hash=sha256:1110e22af8ca26b90bd6364fe4c763329b0ebf1ee213ba32b68c73de5752323d \ + --hash=sha256:130272c698667a982a5d0e626851ceff662565379baf0ff2cc58067b81d4f11d \ + --hash=sha256:136815f06a3ae311fae551c3df1f998a1ebd01ddd424aa5603a4336997629e95 \ + --hash=sha256:14215b71a762336254351b00ec720a8e85cada43b987da5a042e4ce3e82bd68e \ + --hash=sha256:1db4e7fefefd0f548d73e2e2e041f9df5c59e178b4c72fbac4cc6f535cfb1565 \ + --hash=sha256:1ffd9493de4c922f2a38c2bf62b831dcec90ac673ed1ca182fe11b4d8e9f2a64 \ + --hash=sha256:2006769bd1640bdf4d5641c69a3d63b71b81445473cac5ded39740a226fa88ab \ + --hash=sha256:20587d20f557fe189b7947d8e7ec5afa110ccf72a3128d61a2a387c3313f46be \ + --hash=sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e \ + --hash=sha256:27623ba66c183eca01bf9ff833875b459cad267aeeb044477fedac35e19ba907 \ + --hash=sha256:285e96d9d53422efc0d7a17c60e59f37fbf3dfa942073f666db4ac71e8d726d0 \ + --hash=sha256:2de62e8801ddfff069cd5c504ce3bc9672b23266597d4e4f50eda28846c322f2 \ + --hash=sha256:2f6c34da58ea9c1a9515621f4d9ac379871a8f21168ba1b5e09d74250de5ad62 \ + --hash=sha256:309a7de0a0ff3040acaebb35ec45d18db4b28232f21998851cfa709eeff49d62 \ + --hash=sha256:35c404d74c2926d0287fbd63ed5d27eb911eb9e4a3bb2c6d294f3cfd4a9e0c23 \ + --hash=sha256:3710a9751938947e6327ea9f3ea6332a09bf0ba0c09cae9cb1f250bd1f1549bc \ + --hash=sha256:3d59d125ffbd6d552765510e3f31ed75ebac2c7470c7274195b9161a32350284 \ + --hash=sha256:40d3ff7fc90b98c637bda91c89d51264a3dcf210cade3a2c6f838c7268d7a4ca \ + --hash=sha256:425c5f215d0eecee9a56cdb703203dda90423247421bf0d67125add85d0c4455 \ + --hash=sha256:43193c5cda5d612f247172016c4bb71251c784d7a4d9314677186a838ad34858 \ + --hash=sha256:44aeb140295a2f0659e113b31cfe92c9061622cadbc9e2a2f7b8ef6b1e29ef4b \ + 
--hash=sha256:47334db71978b23ebcf3c0f9f5ee98b8d65992b65c9c4f2d34c2eaf5bcaf0594 \ + --hash=sha256:4796efc4faf6b53a18e3d46343535caed491776a22af773f366534056c4e1fbc \ + --hash=sha256:4a51b48f42d9358460b78725283f04bddaf44a9358197b889657deba38f329db \ + --hash=sha256:4b67fdab07fdd3c10bb21edab3cbfe8cf5696f453afce75d815d9d7223fbe88b \ + --hash=sha256:4ec9dd88a5b71abfc74e9df5ebe7921c35cbb3b641181a531ca65cdb5e8e4dea \ + --hash=sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6 \ + --hash=sha256:54b6a92d009cbe2fb11054ba694bc9e284dad30a26757b1e372a1fdddaf21920 \ + --hash=sha256:55f56e2ebd4e3bc50442fbc0888c9d8c94e4e06a933804e2af3e89e2f9c1c749 \ + --hash=sha256:5726cf76c982532c1863fb64d8c6dd0e4c90b6ece9feb06c9f202417a31f7dd7 \ + --hash=sha256:5d447056e2ca60382d460a604b6302d8db69476fd2015c81e7c35417cfabe4cd \ + --hash=sha256:5ed2e36c3e9b4f21dd9422f6893dec0abf2cca553af509b10cd630f878d3eb99 \ + --hash=sha256:5ff2ed8194587faf56555927b3aa10e6fb69d931e33953943bc4f837dfee2242 \ + --hash=sha256:62f60aebecfc7f4b82e3f639a7d1433a20ec32824db2199a11ad4f5e146ef5ee \ + --hash=sha256:63bc5c4ae26e4bc6be6469943b8253c0fd4e4186c43ad46e713ea61a0ba49129 \ + --hash=sha256:6b40e8d38afe634559e398cc32b1472f376a4099c75fe6299ae607e404c033b2 \ + --hash=sha256:6b493a043635eb376e50eedf7818f2f322eabbaa974e948bd8bdd29eb7ef2a51 \ + --hash=sha256:6dba5d19c4dfab08e58d5b36304b3f92f3bd5d42c1a3fa37b5ba5cdf6dfcbcee \ + --hash=sha256:6fd30dc99682dc2c603c2b315bded2799019cea829f8bf57dc6b61efde6611c8 \ + --hash=sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b \ + --hash=sha256:7706f5850360ac01d80c89bcef1640683cc12ed87f42579dab6c5d3ed6888613 \ + --hash=sha256:7782afc9b6b42200f7362858f9e73b1f8316afb276d316336c0ec3bd73312742 \ + --hash=sha256:79983512b108e4a164b9c8d34de3992f76d48cadc9554c9e60b43f308988aabe \ + --hash=sha256:7f683ddc7eedd742e2889d2bfb96d69573fde1d92fcb811979cdb7165bb9c7d3 \ + --hash=sha256:82357d85de703176b5587dbe6ade8ff67f9f69a41c0733cf2425378b49954de5 \ + --hash=sha256:84450ba661fb96e9fd67629b93d2941c871ca86fc38d835d19d4225ff946a631 \ + --hash=sha256:86f4e8cca779080f66ff4f191a685ced73d2f72d50216f7112185dc02b90b9b7 \ + --hash=sha256:8cda06946eac330cbe6598f77bb54e690b4ca93f593dee1568ad22b04f347c15 \ + --hash=sha256:8ce7fd6767a1cc5a92a639b391891bf1c268b03ec7e021c7d6d902285259685c \ + --hash=sha256:8ff4e7cdfdb1ab5698e675ca622e72d58a6fa2a8aa58195de0c0061288e6e3ea \ + --hash=sha256:9289fd5dddcf57bab41d044f1756550f9e7cf0c8e373b8cdf0ce8773dc4bd417 \ + --hash=sha256:92a7e36b000bf022ef3dbb9c46bfe2d52c047d5e3f3343f43204263c5addc250 \ + --hash=sha256:92db3c28b5b2a273346bebb24857fda45601aef6ae1c011c0a997106581e8a88 \ + --hash=sha256:95c3c157765b031331dd4db3c775e58deaee050a3042fcad72cbc4189d7c8dca \ + --hash=sha256:980b4f289d1d90ca5efcf07958d3eb38ed9c0b7676bf2831a54d4f66f9c27dfa \ + --hash=sha256:9ae4ef0b3f6b41bad6366fb0ea4fc1d7ed051528e113a60fa2a65a9abb5b1d99 \ + --hash=sha256:9c98230f5042f4945f957d006edccc2af1e03ed5e37ce7c373f00a5a4daa6149 \ + --hash=sha256:9fa2566ca27d67c86569e8c85297aaf413ffab85a8960500f12ea34ff98e4c41 \ + --hash=sha256:a14969b8691f7998e74663b77b4c36c0337cb1df552da83d5c9004a93afdb574 \ + --hash=sha256:a8aacce6e2e1edcb6ac625fb0f8c3a9570ccc7bfba1f63419b3769ccf6a00ed0 \ + --hash=sha256:a8e538f46104c815be19c975572d74afb53f29650ea2025bbfaef359d2de2f7f \ + --hash=sha256:aa41e526a5d4a9dfcfbab0716c7e8a1b215abd3f3df5a45cf18a12721d31cb5d \ + --hash=sha256:aa693779a8b50cd97570e5a0f343538a8dbd3e496fa5dcb87e29406ad0299654 \ + 
--hash=sha256:ab22fbd9765e6954bc0bcff24c25ff71dcbfdb185fcdaca49e81bac68fe724d3 \ + --hash=sha256:ab2e5bef076f5a235c3774b4f4028a680432cded7cad37bba0fd90d64b187d19 \ + --hash=sha256:ab973df98fc99ab39080bfb0eb3a925181454d7c3ac8a1e695fddfae696d9e90 \ + --hash=sha256:af73657b7a68211996527dbfeffbb0864e043d270580c5aef06dc4b659a4b578 \ + --hash=sha256:b197e7094f232959f8f20541ead1d9862ac5ebea1d58e9849c1bf979255dfac9 \ + --hash=sha256:b295729485b06c1a0683af02a9e42d2caa9db04a373dc38a6a58cdd1e8abddf1 \ + --hash=sha256:b8831399554b92b72af5932cdbbd4ddc55c55f631bb13ff8fe4e6536a06c5c51 \ + --hash=sha256:b8dcd239c743aa2f9c22ce674a145e0a25cb1566c495928440a181ca1ccf6719 \ + --hash=sha256:bcb4f8ea87d03bc51ad04add8ceaf9b0f085ac045ab4d74e73bbc2dc033f0236 \ + --hash=sha256:bd7af3717683bea4c87acd8c0d3d5b44d56120b26fd3f8a692bdd2d5260c620a \ + --hash=sha256:bf4475b82be41b07cc5e5ff94810e6a01f276e37c2d55571e3fe175e467a1a1c \ + --hash=sha256:c3e446d253bd88f6377260d07c895816ebf33ffffd56c1c792b13bff9c3e1ade \ + --hash=sha256:c57516e58fd17d03ebe67e181a4e4e2ccab1168f8c2976c6a334d4f819fe5944 \ + --hash=sha256:c94057af19bc953643a33581844649a7fdab902624d2eb739738a30e2b3e60fc \ + --hash=sha256:cab5d0b79d987c67f3b9e9c53f54a61360422a5a0bc075f43cab5621d530c3b6 \ + --hash=sha256:ce031db0408e487fd2775d745ce30a7cd2923667cf3b69d48d219f1d8f5ddeb6 \ + --hash=sha256:cee4373f4d3ad28f1ab6290684d8e2ebdb9e7a1b74fdc39e4c211995f77bec27 \ + --hash=sha256:d5b054862739d276e09928de37c79ddeec42a6e1bfc55863be96a36ba22926f6 \ + --hash=sha256:dbe03226baf438ac4fda9e2d0715022fd579cb641c4cf639fa40d53b2fe6f3e2 \ + --hash=sha256:dc15e99b2d8a656f8e666854404f1ba54765871104e50c8e9813af8a7db07f12 \ + --hash=sha256:dcaf7c1524c0542ee2fc82cc8ec337f7a9f7edee2532421ab200d2b920fc97cf \ + --hash=sha256:dd4eda173a9fcccb5f2e2bd2a9f423d180194b1bf17cf59e3269899235b2a114 \ + --hash=sha256:dd9a8bd8900e65504a305bf8ae6fa9fbc66de94178c420791d0293702fce2df7 \ + --hash=sha256:de7376c29d95d6719048c194a9cf1a1b0393fbe8488a22008610b0361d834ecf \ + --hash=sha256:e7fdd52961feb4c96507aa649550ec2a0d527c086d284749b2f582f2d40a2e0d \ + --hash=sha256:e91f541a85298cf35433bf66f3fab2a4a2cff05c127eeca4af174f6d497f0d4b \ + --hash=sha256:e9e3c4c9e1ed40ea53acf11e2a386383c3304212c965773704e4603d589343ed \ + --hash=sha256:ee803480535c44e7f5ad00788526da7d85525cfefaf8acf8ab9a310000be4b03 \ + --hash=sha256:f09cb5a7bbe1ecae6e87901a2eb23e0256bb524a79ccc53eb0b7629fbe7677c4 \ + --hash=sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67 \ + --hash=sha256:f1a2f519ae173b5b6a2c9d5fa3116ce16e48b3462c8b96dfdded11055e3d6365 \ + --hash=sha256:f28f891ccd15c514a0981f3b9db9aa23d62fe1a99997512b0491d2ed323d229a \ + --hash=sha256:f3e73a4255342d4eb26ef6df01e3962e73aa29baa3124a8e824c5d3364a65748 \ + --hash=sha256:f606a1881d2663630ea5b8ce2efe2111740df4b687bd78b34a8131baa007f79b \ + --hash=sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079 \ + --hash=sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482 + # via requests +chex==0.1.87 \ + --hash=sha256:0096d89cc8d898bb521ef4bfbf5c24549022b0e5b301f529ab57238896fe6c5d \ + --hash=sha256:ce536475661fd96d21be0c1728ecdbedd03f8ff950c662dfc338c92ea782cb16 + # via optax +clu==0.0.12 \ + --hash=sha256:0d183e7d25f7dd0700444510a264e24700e2f068bdabd199ed22866f7e54edba \ + --hash=sha256:f71eaa1afbd30f57f7709257ba7e1feb8ad5c1c3dcae3606672a138678bb3ce4 + # via -r build/requirements.in +contextlib2==21.6.0 \ + --hash=sha256:3fbdb64466afd23abaf6c977627b75b6139a5a3e8ce38405c5b413aed7a0471f \ + 
--hash=sha256:ab1e2bfe1d01d968e1b7e8d9023bc51ef3509bba217bb730cee3827e1ee82869 + # via ml-collections +dm-tree==0.1.8 \ + --hash=sha256:054b461f8176f4bce7a21f7b1870f873a1ced3bdbe1282c816c550bb43c71fa6 \ + --hash=sha256:09964470f76a5201aff2e8f9b26842976de7889300676f927930f6285e256760 \ + --hash=sha256:0d3172394079a86c3a759179c65f64c48d1a42b89495fcf38976d11cc3bb952c \ + --hash=sha256:0e9620ccf06393eb6b613b5e366469304622d4ea96ae6540b28a33840e6c89cf \ + --hash=sha256:0fcaabbb14e7980377439e7140bd05552739ca5e515ecb3119f234acee4b9430 \ + --hash=sha256:1607ce49aa42f010d1e5e616d92ce899d66835d4d8bea49679582435285515de \ + --hash=sha256:181c35521d480d0365f39300542cb6cd7fd2b77351bb43d7acfda15aef63b317 \ + --hash=sha256:1d7c26e431fc93cc7e0cba867eb000db6a05f6f2b25af11ac4e9dada88fc5bca \ + --hash=sha256:1fe962015b2fe1282892b28ebe962faed53c7f98d942da9a4625cbf27baef913 \ + --hash=sha256:250b692fb75f45f02e2f58fbef9ab338904ef334b90557565621fa251df267cf \ + --hash=sha256:2869228d9c619074de501a3c10dc7f07c75422f8fab36ecdcb859b6f1b1ec3ef \ + --hash=sha256:28c52cbf4f8b3dbd0beaedf44f69fa85eec5e9dede612e08035e06ada6ec9426 \ + --hash=sha256:2f7915660f59c09068e428613c480150180df1060561fd0d1470684ae7007bd1 \ + --hash=sha256:343a4a4ebaa127451ff971254a4be4084eb4bdc0b2513c32b46f6f728fd03f9e \ + --hash=sha256:35cc164a79336bfcfafb47e5f297898359123bbd3330c1967f0c4994f9cf9f60 \ + --hash=sha256:378cc8ad93c5fe3590f405a309980721f021c790ca1bdf9b15bb1d59daec57f5 \ + --hash=sha256:39070ba268c0491af9fe7a58644d99e8b4f2cde6e5884ba3380bddc84ed43d5f \ + --hash=sha256:435227cf3c5dc63f4de054cf3d00183790bd9ead4c3623138c74dde7f67f521b \ + --hash=sha256:5483dca4d7eb1a0d65fe86d3b6a53ae717face83c1f17e0887b1a4a64ae5c410 \ + --hash=sha256:694c3654cfd2a81552c08ec66bb5c4a3d48fa292b9a181880fb081c36c5b9134 \ + --hash=sha256:75c5d528bb992981c20793b6b453e91560784215dffb8a5440ba999753c14ceb \ + --hash=sha256:803bfc53b4659f447ac694dbd04235f94a73ef7c1fd1e0df7c84ac41e0bc963b \ + --hash=sha256:81fce77f22a302d7a5968aebdf4efafef4def7ce96528719a354e6990dcd49c7 \ + --hash=sha256:83b7764de0d855338abefc6e3ee9fe40d301668310aa3baea3f778ff051f4393 \ + --hash=sha256:8c60a7eadab64c2278861f56bca320b2720f163dca9d7558103c3b77f2416571 \ + --hash=sha256:8ed3564abed97c806db122c2d3e1a2b64c74a63debe9903aad795167cc301368 \ + --hash=sha256:94d3f0826311f45ee19b75f5b48c99466e4218a0489e81c0f0167bda50cacf22 \ + --hash=sha256:96a548a406a6fb15fe58f6a30a57ff2f2aafbf25f05afab00c8f5e5977b6c715 \ + --hash=sha256:a5d819c38c03f0bb5b3b3703c60e4b170355a0fc6b5819325bf3d4ceb3ae7e80 \ + --hash=sha256:ad16ceba90a56ec47cf45b21856d14962ac314787975ef786efb5e6e9ca75ec7 \ + --hash=sha256:af4b3d372f2477dcd89a6e717e4a575ca35ccc20cc4454a8a4b6f8838a00672d \ + --hash=sha256:b095ba4f8ca1ba19350fd53cf1f8f3eb0bd406aa28af64a6dfc86707b32a810a \ + --hash=sha256:b9bd9b9ccb59409d33d51d84b7668010c04c2af7d4a371632874c1ca356cff3d \ + --hash=sha256:b9f89a454e98806b44fe9d40ec9eee61f848388f7e79ac2371a55679bd5a3ac6 \ + --hash=sha256:bb2d109f42190225112da899b9f3d46d0d5f26aef501c61e43529fe9322530b5 \ + --hash=sha256:c0a94aba18a35457a1b5cd716fd7b46c5dafdc4cf7869b4bae665b91c4682a8e \ + --hash=sha256:c5c8c12e3fda754ef6af94161bacdaeda816d941995fac415d6855c6c386af68 \ + --hash=sha256:d1612fcaecd79023dbc6a6ae48d51a80beb5c385d6f3f6d71688e57bc8d07de8 \ + --hash=sha256:d16e1f2a073604cfcc09f7131ae8d534674f43c3aef4c25742eae295bc60d04f \ + --hash=sha256:d20f2faa3672b52e5013f4077117bfb99c4cfc0b445d3bde1584c34032b57436 \ + --hash=sha256:d40fa4106ca6edc66760246a08f500ec0c85ef55c762fb4a363f6ee739ba02ee \ + 
--hash=sha256:de287fabc464b8734be251e46e06aa9aa1001f34198da2b6ce07bd197172b9cb \ + --hash=sha256:e4d714371bb08839e4e5e29024fc95832d9affe129825ef38836b143028bd144 \ + --hash=sha256:ea9e59e0451e7d29aece402d9f908f2e2a80922bcde2ebfd5dcb07750fcbfee8 \ + --hash=sha256:f7ac31b9aecccb2c6e1ab29706f6ded3eba0c2c69c770322c9c685929c3d6afb \ + --hash=sha256:fa42a605d099ee7d41ba2b5fb75e21423951fd26e5d50583a00471238fb3021d + # via -r build/requirements.in +einops==0.8.0 \ + --hash=sha256:63486517fed345712a8385c100cb279108d9d47e6ae59099b07657e983deae85 \ + --hash=sha256:9572fb63046264a862693b0a87088af3bdc8c068fde03de63453cbbde245465f + # via -r build/requirements.in +etils[epath,epy]==1.10.0 \ + --hash=sha256:0777fe60a234b4c65ca53470fc64f2dd2d0c6bca7fcc623fdaa8d7fa5a317098 \ + --hash=sha256:4eaa9d7248fd4eeb75e44d47ca29875a5ccea044cc14a17435794bf8ac116a05 + # via + # clu + # optax + # orbax-checkpoint +flax==0.10.1 \ + --hash=sha256:5218959706bc659a1f282ca537446163093d186d8edb9b1405c0efee4d90d22a \ + --hash=sha256:ea98ed843c37954af2e262ea47356312a046794d7a5490d31682dffe908e25d3 + # via + # -r build/requirements.in + # clu +fsspec==2024.10.0 \ + --hash=sha256:03b9a6785766a4de40368b88906366755e2819e758b83705c88cd7cb5fe81871 \ + --hash=sha256:eda2d8a4116d4f2429db8550f2457da57279247dd930bb12f821b58391359493 + # via etils +google-benchmark==1.8.3 \ + --hash=sha256:063f6df1ed384e4dc881ac96644153c18ed755f1a2ed32272534a110bdf14871 \ + --hash=sha256:066b69f809fd0ebc697c90075d1194e4c4ada117811731431523f821b421b28f \ + --hash=sha256:2b3bb7905233dec505de5cff35e0725b190f411d16ae97e9050073bf9c79cf2a \ + --hash=sha256:5c4786323817112303edf7fd70dc60d1aa15c175d1c9e2c63d71292bb3e51828 \ + --hash=sha256:71152a826b162146473a06015eefa9f066e19b316a06826fbf25386615653a64 \ + --hash=sha256:731f1881b757df18add80566ae796b6da101935ea1f45932d1ee094d5fb85b46 \ + --hash=sha256:902d6e6da560a716ba709c6b55f8585f1aa64a76711b9a1f068e064567f58a4a \ + --hash=sha256:93e9ef9abf9f9e845a2141935bbcee5e42a7bedc3efb14072adc0310a8b49072 \ + --hash=sha256:9e1d39431e2a5d0960676c3f62180f48c0cb2802c42895eaf5541b7029c20301 \ + --hash=sha256:9f3432a57177f7a46608a07551d50edfe608da344aca07d476a888fb36438650 \ + --hash=sha256:aa3354bb71dc3a32672d1c7fd0621f4967c519213c018dd8e20a9d9e6fb2ae7b \ + --hash=sha256:ab8212aaadc39b5aaa0afc063b64959ca93271cf6a72852f0d0aad26f9ae9f24 \ + --hash=sha256:ba0547b1075a290e3432025bb544b02f7c717c30e31f696f82907571cb5e2be9 \ + --hash=sha256:c476005b9e7f32c45000719b7c8c2fa95ddcfc058af8d08052eb73692d143619 \ + --hash=sha256:d1504fd53e936d70f438e474c2e87fd94f81bd74a5ae855b1e40d1f9994cdbeb \ + --hash=sha256:d2ea4544d3e17a6f87432bc97e79fea23490d9c7c4d10ebd213acf6a40bd1b61 \ + --hash=sha256:d5d31bbbec9ebe9a1bab34a631a35988c424ef55ea14055238bc77f7d8f19836 \ + --hash=sha256:e69bd848173557ed3762830725bff00c2a92de974189a54bd77485bb8bcb18f4 \ + --hash=sha256:f23a591951c59100e30d97b7ba222072f544d318f470420e21872dee40a4aff0 \ + --hash=sha256:fb014cb611e929d2c2696b009f51ac657c24f706881f3123f10c810b11ba378b \ + --hash=sha256:fc4faa364f22ef81b7d3e9f4ecc6ad62f28d68c47008002aa64474b941b1c76c + # via -r build/requirements.in +humanize==4.11.0 \ + --hash=sha256:b53caaec8532bcb2fff70c8826f904c35943f8cecaca29d272d9df38092736c0 \ + --hash=sha256:e66f36020a2d5a974c504bd2555cf770621dbdbb6d82f94a6857c0b1ea2608be + # via orbax-checkpoint +idna==3.10 \ + --hash=sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9 \ + --hash=sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3 + # via requests +importlib-resources==6.4.5 
\ + --hash=sha256:980862a1d16c9e147a59603677fa2aa5fd82b87f223b6cb870695bcfce830065 \ + --hash=sha256:ac29d5f956f01d5e4bb63102a5a19957f1b9175e45649977264a1416783bb717 + # via etils +jax[tpu]==0.4.35 \ + --hash=sha256:c0c986993026b10bf6f607fecb7417377460254640766ce40f1fef3fd139c12e \ + --hash=sha256:fa99e909a31424abfec750019a6dd36f6acc18a6e7d40e2c0086b932cc351325 + # via + # -r build/requirements.in + # chex + # clu + # flax + # optax + # orbax-checkpoint +jaxlib==0.4.35 \ + --hash=sha256:04d1db3bf0050d120238bfb9b686b58fefcc4d9dd9e2d96aecd3f68a1f1f5e0a \ + --hash=sha256:0be3cf9df879d9ae1b5b92fc281f77d21f522fcbae1a48a02661026bbd9b9309 \ + --hash=sha256:0fd990354d5623d3a34493fcd7213493390dbf5039bea19b62e2aaee1049eda9 \ + --hash=sha256:14aeac3fea2ca1d5afb1878f72470b159cc89adb2633c5f0686f5d7c39f2ac18 \ + --hash=sha256:187cb6929dc139b75d952d67c33118473c1b4105525a3e5607f064e7b8efdc74 \ + --hash=sha256:261570c94b169dc90f3af903282eeec856b52736c0944d243504ced93d19b217 \ + --hash=sha256:330c090bb9af413f552d8a92d097e50baec6b75823430fb2966a49f5298d4c43 \ + --hash=sha256:504d0a2e2117724359d99d7e3663022686dcdddd85aa14bdad02008d444481ad \ + --hash=sha256:5d2d8a5b89d334b875ede98d7fcee946bebef1a1b5abd118ff543bcef4ab09f5 \ + --hash=sha256:7b11ad7c13f7f96f36efd303711ecac425f19ca2ddf65cf1be1541167a959ee5 \ + --hash=sha256:7f8bfc90f68857b223b7e38a9bdf466a4f1cb405c9a4aa11698dc9ab7b35c29b \ + --hash=sha256:8f8c499644660aefd0ae2ee31039da6d4df0f26d0ee67ba9fb316183a5304288 \ + --hash=sha256:907e548ad6ce53b242a55c5f36c2a2a4c37d38f6cd8c356fc550a2f18ab0e82f \ + --hash=sha256:91a283a72263feebe0d110d1136df96950744e47530f12df42c03f36888c971e \ + --hash=sha256:b44f3e6e9fb748bb43df914356cf9d0d0c9a6e446a12c21fe843db25ed0df65f \ + --hash=sha256:bc9eafba001ff8569cfa252fe7f04ba553622702b4b473b656dd0866edf6b8d4 \ + --hash=sha256:d210bab7e1ce0b2f2e568548b3903ea6aec349019fc1398cd2a0c069e8342e62 \ + --hash=sha256:dddffce48d7e6057008999aed2d8a9daecc57a48c45a4f8c475e00880eb2e41d \ + --hash=sha256:e1cee6dc291251f3fb6b0127fdd96c0439ac1ea97e01571d06910df72d6ac6e1 \ + --hash=sha256:e8c9579e20d5ecdc4f61336cdd032710cb8c38d5ae9c4fce0cf9ea031cef21cb + # via + # chex + # clu + # jax + # optax +libtpu==0.0.2 \ + --hash=sha256:9e1f7899ece1f4bb8c0832f5570246b46f1ca57837e5b62e1409ee48cf06403f + # via jax +libtpu-nightly==0.1.dev20241010+nightly.cleanup \ + --hash=sha256:935fe93a8d34e4566c168e9bc8c690d4729d5cf4e051625e86f4e4fa9a261232 + # via jax +markdown-it-py==3.0.0 \ + --hash=sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1 \ + --hash=sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb + # via rich +mdurl==0.1.2 \ + --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ + --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba + # via markdown-it-py +ml-collections==0.1.1 \ + --hash=sha256:3fefcc72ec433aa1e5d32307a3e474bbb67f405be814ea52a2166bfc9dbe68cc + # via clu +ml-dtypes==0.5.0 \ + --hash=sha256:099e09edd54e676903b4538f3815b5ab96f5b119690514602d96bfdb67172cbe \ + --hash=sha256:2e7534392682c3098bc7341648c650864207169c654aed83143d7a19c67ae06f \ + --hash=sha256:3e7d3a380fe73a63c884f06136f8baa7a5249cc8e9fdec677997dd78549f8128 \ + --hash=sha256:54415257f00eb44fbcc807454efac3356f75644f1cbfc2d4e5522a72ae1dacab \ + --hash=sha256:5f2b59233a0dbb6a560b3137ed6125433289ccba2f8d9c3695a52423a369ed15 \ + --hash=sha256:60275f2b51b56834e840c4809fca840565f9bf8e9a73f6d8c94f5b5935701215 \ + --hash=sha256:76942f6aeb5c40766d5ea62386daa4148e6a54322aaf5b53eae9e7553240222f \ 
+ --hash=sha256:7ee9c320bb0f9ffdf9f6fa6a696ef2e005d1f66438d6f1c1457338e00a02e8cf \ + --hash=sha256:8c32138975797e681eb175996d64356bcfa124bdbb6a70460b9768c2b35a6fa4 \ + --hash=sha256:968fede07d1f9b926a63df97d25ac656cac1a57ebd33701734eaf704bc55d8d8 \ + --hash=sha256:a03fc861b86cc586728e3d093ba37f0cc05e65330c3ebd7688e7bae8290f8859 \ + --hash=sha256:a38df8df61194aeaae1ab7579075779b4ad32cd1cffd012c28be227fa7f2a70a \ + --hash=sha256:a988bac6572630e1e9c2edd9b1277b4eefd1c86209e52b0d061b775ac33902ff \ + --hash=sha256:ab046f2ff789b1f11b2491909682c5d089934835f9a760fafc180e47dcb676b8 \ + --hash=sha256:afa08343069874a30812871d639f9c02b4158ace065601406a493a8511180c02 \ + --hash=sha256:c7a9152f5876fef565516aa5dd1dccd6fc298a5891b2467973905103eb5c7856 \ + --hash=sha256:cb5cc7b25acabd384f75bbd78892d0c724943f3e2e1986254665a1aa10982e07 \ + --hash=sha256:d3b3db9990c3840986a0e70524e122cfa32b91139c3653df76121ba7776e015f \ + --hash=sha256:d4b1a70a3e5219790d6b55b9507606fc4e02911d1497d16c18dd721eb7efe7d0 \ + --hash=sha256:dc74fd9995513d33eac63d64e436240f5494ec74d522a9f0920194942fc3d2d7 \ + --hash=sha256:e04fde367b2fe901b1d47234426fe8819909bd1dd862a5adb630f27789c20599 + # via + # jax + # jaxlib + # tensorstore +msgpack==1.1.0 \ + --hash=sha256:06f5fd2f6bb2a7914922d935d3b8bb4a7fff3a9a91cfce6d06c13bc42bec975b \ + --hash=sha256:071603e2f0771c45ad9bc65719291c568d4edf120b44eb36324dcb02a13bfddf \ + --hash=sha256:0907e1a7119b337971a689153665764adc34e89175f9a34793307d9def08e6ca \ + --hash=sha256:0f92a83b84e7c0749e3f12821949d79485971f087604178026085f60ce109330 \ + --hash=sha256:115a7af8ee9e8cddc10f87636767857e7e3717b7a2e97379dc2054712693e90f \ + --hash=sha256:13599f8829cfbe0158f6456374e9eea9f44eee08076291771d8ae93eda56607f \ + --hash=sha256:17fb65dd0bec285907f68b15734a993ad3fc94332b5bb21b0435846228de1f39 \ + --hash=sha256:2137773500afa5494a61b1208619e3871f75f27b03bcfca7b3a7023284140247 \ + --hash=sha256:3180065ec2abbe13a4ad37688b61b99d7f9e012a535b930e0e683ad6bc30155b \ + --hash=sha256:398b713459fea610861c8a7b62a6fec1882759f308ae0795b5413ff6a160cf3c \ + --hash=sha256:3d364a55082fb2a7416f6c63ae383fbd903adb5a6cf78c5b96cc6316dc1cedc7 \ + --hash=sha256:3df7e6b05571b3814361e8464f9304c42d2196808e0119f55d0d3e62cd5ea044 \ + --hash=sha256:41c991beebf175faf352fb940bf2af9ad1fb77fd25f38d9142053914947cdbf6 \ + --hash=sha256:42f754515e0f683f9c79210a5d1cad631ec3d06cea5172214d2176a42e67e19b \ + --hash=sha256:452aff037287acb1d70a804ffd022b21fa2bb7c46bee884dbc864cc9024128a0 \ + --hash=sha256:4676e5be1b472909b2ee6356ff425ebedf5142427842aa06b4dfd5117d1ca8a2 \ + --hash=sha256:46c34e99110762a76e3911fc923222472c9d681f1094096ac4102c18319e6468 \ + --hash=sha256:471e27a5787a2e3f974ba023f9e265a8c7cfd373632247deb225617e3100a3c7 \ + --hash=sha256:4a1964df7b81285d00a84da4e70cb1383f2e665e0f1f2a7027e683956d04b734 \ + --hash=sha256:4b51405e36e075193bc051315dbf29168d6141ae2500ba8cd80a522964e31434 \ + --hash=sha256:4d1b7ff2d6146e16e8bd665ac726a89c74163ef8cd39fa8c1087d4e52d3a2325 \ + --hash=sha256:53258eeb7a80fc46f62fd59c876957a2d0e15e6449a9e71842b6d24419d88ca1 \ + --hash=sha256:534480ee5690ab3cbed89d4c8971a5c631b69a8c0883ecfea96c19118510c846 \ + --hash=sha256:58638690ebd0a06427c5fe1a227bb6b8b9fdc2bd07701bec13c2335c82131a88 \ + --hash=sha256:58dfc47f8b102da61e8949708b3eafc3504509a5728f8b4ddef84bd9e16ad420 \ + --hash=sha256:59caf6a4ed0d164055ccff8fe31eddc0ebc07cf7326a2aaa0dbf7a4001cd823e \ + --hash=sha256:5dbad74103df937e1325cc4bfeaf57713be0b4f15e1c2da43ccdd836393e2ea2 \ + --hash=sha256:5e1da8f11a3dd397f0a32c76165cf0c4eb95b31013a94f6ecc0b280c05c91b59 \ + 
--hash=sha256:646afc8102935a388ffc3914b336d22d1c2d6209c773f3eb5dd4d6d3b6f8c1cb \ + --hash=sha256:64fc9068d701233effd61b19efb1485587560b66fe57b3e50d29c5d78e7fef68 \ + --hash=sha256:65553c9b6da8166e819a6aa90ad15288599b340f91d18f60b2061f402b9a4915 \ + --hash=sha256:685ec345eefc757a7c8af44a3032734a739f8c45d1b0ac45efc5d8977aa4720f \ + --hash=sha256:6ad622bf7756d5a497d5b6836e7fc3752e2dd6f4c648e24b1803f6048596f701 \ + --hash=sha256:73322a6cc57fcee3c0c57c4463d828e9428275fb85a27aa2aa1a92fdc42afd7b \ + --hash=sha256:74bed8f63f8f14d75eec75cf3d04ad581da6b914001b474a5d3cd3372c8cc27d \ + --hash=sha256:79ec007767b9b56860e0372085f8504db5d06bd6a327a335449508bbee9648fa \ + --hash=sha256:7a946a8992941fea80ed4beae6bff74ffd7ee129a90b4dd5cf9c476a30e9708d \ + --hash=sha256:7ad442d527a7e358a469faf43fda45aaf4ac3249c8310a82f0ccff9164e5dccd \ + --hash=sha256:7c9a35ce2c2573bada929e0b7b3576de647b0defbd25f5139dcdaba0ae35a4cc \ + --hash=sha256:7e7b853bbc44fb03fbdba34feb4bd414322180135e2cb5164f20ce1c9795ee48 \ + --hash=sha256:879a7b7b0ad82481c52d3c7eb99bf6f0645dbdec5134a4bddbd16f3506947feb \ + --hash=sha256:8a706d1e74dd3dea05cb54580d9bd8b2880e9264856ce5068027eed09680aa74 \ + --hash=sha256:8a84efb768fb968381e525eeeb3d92857e4985aacc39f3c47ffd00eb4509315b \ + --hash=sha256:8cf9e8c3a2153934a23ac160cc4cba0ec035f6867c8013cc6077a79823370346 \ + --hash=sha256:8da4bf6d54ceed70e8861f833f83ce0814a2b72102e890cbdfe4b34764cdd66e \ + --hash=sha256:8e59bca908d9ca0de3dc8684f21ebf9a690fe47b6be93236eb40b99af28b6ea6 \ + --hash=sha256:914571a2a5b4e7606997e169f64ce53a8b1e06f2cf2c3a7273aa106236d43dd5 \ + --hash=sha256:a51abd48c6d8ac89e0cfd4fe177c61481aca2d5e7ba42044fd218cfd8ea9899f \ + --hash=sha256:a52a1f3a5af7ba1c9ace055b659189f6c669cf3657095b50f9602af3a3ba0fe5 \ + --hash=sha256:ad33e8400e4ec17ba782f7b9cf868977d867ed784a1f5f2ab46e7ba53b6e1e1b \ + --hash=sha256:b4c01941fd2ff87c2a934ee6055bda4ed353a7846b8d4f341c428109e9fcde8c \ + --hash=sha256:bce7d9e614a04d0883af0b3d4d501171fbfca038f12c77fa838d9f198147a23f \ + --hash=sha256:c40ffa9a15d74e05ba1fe2681ea33b9caffd886675412612d93ab17b58ea2fec \ + --hash=sha256:c5a91481a3cc573ac8c0d9aace09345d989dc4a0202b7fcb312c88c26d4e71a8 \ + --hash=sha256:c921af52214dcbb75e6bdf6a661b23c3e6417f00c603dd2070bccb5c3ef499f5 \ + --hash=sha256:d46cf9e3705ea9485687aa4001a76e44748b609d260af21c4ceea7f2212a501d \ + --hash=sha256:d8ce0b22b890be5d252de90d0e0d119f363012027cf256185fc3d474c44b1b9e \ + --hash=sha256:dd432ccc2c72b914e4cb77afce64aab761c1137cc698be3984eee260bcb2896e \ + --hash=sha256:e0856a2b7e8dcb874be44fea031d22e5b3a19121be92a1e098f46068a11b0870 \ + --hash=sha256:e1f3c3d21f7cf67bcf2da8e494d30a75e4cf60041d98b3f79875afb5b96f3a3f \ + --hash=sha256:f1ba6136e650898082d9d5a5217d5906d1e138024f836ff48691784bbe1adf96 \ + --hash=sha256:f3e9b4936df53b970513eac1758f3882c88658a220b58dcc1e39606dccaaf01c \ + --hash=sha256:f80bc7d47f76089633763f952e67f8214cb7b3ee6bfa489b3cb6a84cfac114cd \ + --hash=sha256:fd2906780f25c8ed5d7b323379f6138524ba793428db5d0e9d226d3fa6aa1788 + # via + # flax + # orbax-checkpoint +nest-asyncio==1.6.0 \ + --hash=sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe \ + --hash=sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c + # via orbax-checkpoint +numpy==2.1.3 \ + --hash=sha256:016d0f6f5e77b0f0d45d77387ffa4bb89816b57c835580c3ce8e099ef830befe \ + --hash=sha256:02135ade8b8a84011cbb67dc44e07c58f28575cf9ecf8ab304e51c05528c19f0 \ + --hash=sha256:08788d27a5fd867a663f6fc753fd7c3ad7e92747efc73c53bca2f19f8bc06f48 \ + 
--hash=sha256:0d30c543f02e84e92c4b1f415b7c6b5326cbe45ee7882b6b77db7195fb971e3a \ + --hash=sha256:0fa14563cc46422e99daef53d725d0c326e99e468a9320a240affffe87852564 \ + --hash=sha256:13138eadd4f4da03074851a698ffa7e405f41a0845a6b1ad135b81596e4e9958 \ + --hash=sha256:14e253bd43fc6b37af4921b10f6add6925878a42a0c5fe83daee390bca80bc17 \ + --hash=sha256:15cb89f39fa6d0bdfb600ea24b250e5f1a3df23f901f51c8debaa6a5d122b2f0 \ + --hash=sha256:17ee83a1f4fef3c94d16dc1802b998668b5419362c8a4f4e8a491de1b41cc3ee \ + --hash=sha256:2312b2aa89e1f43ecea6da6ea9a810d06aae08321609d8dc0d0eda6d946a541b \ + --hash=sha256:2564fbdf2b99b3f815f2107c1bbc93e2de8ee655a69c261363a1172a79a257d4 \ + --hash=sha256:3522b0dfe983a575e6a9ab3a4a4dfe156c3e428468ff08ce582b9bb6bd1d71d4 \ + --hash=sha256:4394bc0dbd074b7f9b52024832d16e019decebf86caf909d94f6b3f77a8ee3b6 \ + --hash=sha256:45966d859916ad02b779706bb43b954281db43e185015df6eb3323120188f9e4 \ + --hash=sha256:4d1167c53b93f1f5d8a139a742b3c6f4d429b54e74e6b57d0eff40045187b15d \ + --hash=sha256:4f2015dfe437dfebbfce7c85c7b53d81ba49e71ba7eadbf1df40c915af75979f \ + --hash=sha256:50ca6aba6e163363f132b5c101ba078b8cbd3fa92c7865fd7d4d62d9779ac29f \ + --hash=sha256:50d18c4358a0a8a53f12a8ba9d772ab2d460321e6a93d6064fc22443d189853f \ + --hash=sha256:5641516794ca9e5f8a4d17bb45446998c6554704d888f86df9b200e66bdcce56 \ + --hash=sha256:576a1c1d25e9e02ed7fa5477f30a127fe56debd53b8d2c89d5578f9857d03ca9 \ + --hash=sha256:6a4825252fcc430a182ac4dee5a505053d262c807f8a924603d411f6718b88fd \ + --hash=sha256:72dcc4a35a8515d83e76b58fdf8113a5c969ccd505c8a946759b24e3182d1f23 \ + --hash=sha256:747641635d3d44bcb380d950679462fae44f54b131be347d5ec2bce47d3df9ed \ + --hash=sha256:762479be47a4863e261a840e8e01608d124ee1361e48b96916f38b119cfda04a \ + --hash=sha256:78574ac2d1a4a02421f25da9559850d59457bac82f2b8d7a44fe83a64f770098 \ + --hash=sha256:825656d0743699c529c5943554d223c021ff0494ff1442152ce887ef4f7561a1 \ + --hash=sha256:8637dcd2caa676e475503d1f8fdb327bc495554e10838019651b76d17b98e512 \ + --hash=sha256:96fe52fcdb9345b7cd82ecd34547fca4321f7656d500eca497eb7ea5a926692f \ + --hash=sha256:973faafebaae4c0aaa1a1ca1ce02434554d67e628b8d805e61f874b84e136b09 \ + --hash=sha256:996bb9399059c5b82f76b53ff8bb686069c05acc94656bb259b1d63d04a9506f \ + --hash=sha256:a38c19106902bb19351b83802531fea19dee18e5b37b36454f27f11ff956f7fc \ + --hash=sha256:a6b46587b14b888e95e4a24d7b13ae91fa22386c199ee7b418f449032b2fa3b8 \ + --hash=sha256:a9f7f672a3388133335589cfca93ed468509cb7b93ba3105fce780d04a6576a0 \ + --hash=sha256:aa08e04e08aaf974d4458def539dece0d28146d866a39da5639596f4921fd761 \ + --hash=sha256:b0df3635b9c8ef48bd3be5f862cf71b0a4716fa0e702155c45067c6b711ddcef \ + --hash=sha256:b47fbb433d3260adcd51eb54f92a2ffbc90a4595f8970ee00e064c644ac788f5 \ + --hash=sha256:baed7e8d7481bfe0874b566850cb0b85243e982388b7b23348c6db2ee2b2ae8e \ + --hash=sha256:bc6f24b3d1ecc1eebfbf5d6051faa49af40b03be1aaa781ebdadcbc090b4539b \ + --hash=sha256:c006b607a865b07cd981ccb218a04fc86b600411d83d6fc261357f1c0966755d \ + --hash=sha256:c181ba05ce8299c7aa3125c27b9c2167bca4a4445b7ce73d5febc411ca692e43 \ + --hash=sha256:c7662f0e3673fe4e832fe07b65c50342ea27d989f92c80355658c7f888fcc83c \ + --hash=sha256:c80e4a09b3d95b4e1cac08643f1152fa71a0a821a2d4277334c88d54b2219a41 \ + --hash=sha256:c894b4305373b9c5576d7a12b473702afdf48ce5369c074ba304cc5ad8730dff \ + --hash=sha256:d7aac50327da5d208db2eec22eb11e491e3fe13d22653dce51b0f4109101b408 \ + --hash=sha256:d89dd2b6da69c4fff5e39c28a382199ddedc3a5be5390115608345dec660b9e2 \ + 
--hash=sha256:d9beb777a78c331580705326d2367488d5bc473b49a9bc3036c154832520aca9 \ + --hash=sha256:dc258a761a16daa791081d026f0ed4399b582712e6fc887a95af09df10c5ca57 \ + --hash=sha256:e14e26956e6f1696070788252dcdff11b4aca4c3e8bd166e0df1bb8f315a67cb \ + --hash=sha256:e6988e90fcf617da2b5c78902fe8e668361b43b4fe26dbf2d7b0f8034d4cafb9 \ + --hash=sha256:e711e02f49e176a01d0349d82cb5f05ba4db7d5e7e0defd026328e5cfb3226d3 \ + --hash=sha256:ea4dedd6e394a9c180b33c2c872b92f7ce0f8e7ad93e9585312b0c5a04777a4a \ + --hash=sha256:ecc76a9ba2911d8d37ac01de72834d8849e55473457558e12995f4cd53e778e0 \ + --hash=sha256:f55ba01150f52b1027829b50d70ef1dafd9821ea82905b63936668403c3b471e \ + --hash=sha256:f653490b33e9c3a4c1c01d41bc2aef08f9475af51146e4a7710c450cf9761598 \ + --hash=sha256:fa2d1337dc61c8dc417fbccf20f6d1e139896a30721b7f1e832b2bb6ef4eb6c4 + # via + # -r build/requirements.in + # chex + # clu + # flax + # jax + # jaxlib + # ml-dtypes + # optax + # orbax-checkpoint + # scipy + # tensorstore +opt-einsum==3.4.0 \ + --hash=sha256:69bb92469f86a1565195ece4ac0323943e83477171b91d24c35afe028a90d7cd \ + --hash=sha256:96ca72f1b886d148241348783498194c577fa30a8faac108586b14f1ba4473ac + # via jax +optax==0.2.3 \ + --hash=sha256:083e603dcd731d7e74d99f71c12f77937dd53f79001b4c09c290e4f47dd2e94f \ + --hash=sha256:ec7ab925440b0c5a512e1f24fba0fb3e7d760a7fd5d2496d7a691e9d37da01d9 + # via + # -r build/requirements.in + # flax +orbax==0.1.9 \ + --hash=sha256:42dd487ceef9fbf027f4720f3d041686af75120466a528a8a8141226bc197218 + # via -r build/requirements.in +orbax-checkpoint==0.8.0 \ + --hash=sha256:0754ecc2e5fc858e62bbcf610606502d8e1c9ada7295d9bb49cc172f884b0b1e \ + --hash=sha256:df8e353feb7f4eeba9f5b16f704699df54c3c44c5c6ec4d4d117c40bf27830cc + # via + # flax + # orbax +packaging==24.1 \ + --hash=sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002 \ + --hash=sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124 + # via clu +protobuf==5.28.3 \ + --hash=sha256:0c4eec6f987338617072592b97943fdbe30d019c56126493111cf24344c1cc24 \ + --hash=sha256:135658402f71bbd49500322c0f736145731b16fc79dc8f367ab544a17eab4535 \ + --hash=sha256:27b246b3723692bf1068d5734ddaf2fccc2cdd6e0c9b47fe099244d80200593b \ + --hash=sha256:3e6101d095dfd119513cde7259aa703d16c6bbdfae2554dfe5cfdbe94e32d548 \ + --hash=sha256:3fa2de6b8b29d12c61911505d893afe7320ce7ccba4df913e2971461fa36d584 \ + --hash=sha256:64badbc49180a5e401f373f9ce7ab1d18b63f7dd4a9cdc43c92b9f0b481cef7b \ + --hash=sha256:70585a70fc2dd4818c51287ceef5bdba6387f88a578c86d47bb34669b5552c36 \ + --hash=sha256:712319fbdddb46f21abb66cd33cb9e491a5763b2febd8f228251add221981135 \ + --hash=sha256:91fba8f445723fcf400fdbe9ca796b19d3b1242cd873907979b9ed71e4afe868 \ + --hash=sha256:a3f6857551e53ce35e60b403b8a27b0295f7d6eb63d10484f12bc6879c715687 \ + --hash=sha256:cee1757663fa32a1ee673434fcf3bf24dd54763c79690201208bafec62f19eed + # via + # -r build/requirements.in + # orbax-checkpoint +pygments==2.18.0 \ + --hash=sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199 \ + --hash=sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a + # via rich +pyyaml==6.0.2 \ + --hash=sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff \ + --hash=sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48 \ + --hash=sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086 \ + --hash=sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e \ + 
--hash=sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133 \ + --hash=sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5 \ + --hash=sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484 \ + --hash=sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee \ + --hash=sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5 \ + --hash=sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68 \ + --hash=sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a \ + --hash=sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf \ + --hash=sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99 \ + --hash=sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8 \ + --hash=sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85 \ + --hash=sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19 \ + --hash=sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc \ + --hash=sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a \ + --hash=sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1 \ + --hash=sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317 \ + --hash=sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c \ + --hash=sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631 \ + --hash=sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d \ + --hash=sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652 \ + --hash=sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5 \ + --hash=sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e \ + --hash=sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b \ + --hash=sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8 \ + --hash=sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476 \ + --hash=sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706 \ + --hash=sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563 \ + --hash=sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237 \ + --hash=sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b \ + --hash=sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083 \ + --hash=sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180 \ + --hash=sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425 \ + --hash=sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e \ + --hash=sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f \ + --hash=sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725 \ + --hash=sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183 \ + --hash=sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab \ + --hash=sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774 \ + --hash=sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725 \ + --hash=sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e \ + --hash=sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5 \ + --hash=sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d \ + 
--hash=sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290 \ + --hash=sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44 \ + --hash=sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed \ + --hash=sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4 \ + --hash=sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba \ + --hash=sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12 \ + --hash=sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4 + # via + # flax + # ml-collections + # orbax-checkpoint +requests==2.32.3 \ + --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ + --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 + # via jax +rich==13.9.4 \ + --hash=sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098 \ + --hash=sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90 + # via flax +scipy==1.14.1 \ + --hash=sha256:0c2f95de3b04e26f5f3ad5bb05e74ba7f68b837133a4492414b3afd79dfe540e \ + --hash=sha256:1729560c906963fc8389f6aac023739ff3983e727b1a4d87696b7bf108316a79 \ + --hash=sha256:278266012eb69f4a720827bdd2dc54b2271c97d84255b2faaa8f161a158c3b37 \ + --hash=sha256:2843f2d527d9eebec9a43e6b406fb7266f3af25a751aa91d62ff416f54170bc5 \ + --hash=sha256:2da0469a4ef0ecd3693761acbdc20f2fdeafb69e6819cc081308cc978153c675 \ + --hash=sha256:2ff0a7e01e422c15739ecd64432743cf7aae2b03f3084288f399affcefe5222d \ + --hash=sha256:2ff38e22128e6c03ff73b6bb0f85f897d2362f8c052e3b8ad00532198fbdae3f \ + --hash=sha256:30ac8812c1d2aab7131a79ba62933a2a76f582d5dbbc695192453dae67ad6310 \ + --hash=sha256:3a1b111fac6baec1c1d92f27e76511c9e7218f1695d61b59e05e0fe04dc59617 \ + --hash=sha256:4079b90df244709e675cdc8b93bfd8a395d59af40b72e339c2287c91860deb8e \ + --hash=sha256:5149e3fd2d686e42144a093b206aef01932a0059c2a33ddfa67f5f035bdfe13e \ + --hash=sha256:5a275584e726026a5699459aa72f828a610821006228e841b94275c4a7c08417 \ + --hash=sha256:631f07b3734d34aced009aaf6fedfd0eb3498a97e581c3b1e5f14a04164a456d \ + --hash=sha256:716e389b694c4bb564b4fc0c51bc84d381735e0d39d3f26ec1af2556ec6aad94 \ + --hash=sha256:8426251ad1e4ad903a4514712d2fa8fdd5382c978010d1c6f5f37ef286a713ad \ + --hash=sha256:8475230e55549ab3f207bff11ebfc91c805dc3463ef62eda3ccf593254524ce8 \ + --hash=sha256:8bddf15838ba768bb5f5083c1ea012d64c9a444e16192762bd858f1e126196d0 \ + --hash=sha256:8e32dced201274bf96899e6491d9ba3e9a5f6b336708656466ad0522d8528f69 \ + --hash=sha256:8f9ea80f2e65bdaa0b7627fb00cbeb2daf163caa015e59b7516395fe3bd1e066 \ + --hash=sha256:97c5dddd5932bd2a1a31c927ba5e1463a53b87ca96b5c9bdf5dfd6096e27efc3 \ + --hash=sha256:a49f6ed96f83966f576b33a44257d869756df6cf1ef4934f59dd58b25e0327e5 \ + --hash=sha256:af29a935803cc707ab2ed7791c44288a682f9c8107bc00f0eccc4f92c08d6e07 \ + --hash=sha256:b05d43735bb2f07d689f56f7b474788a13ed8adc484a85aa65c0fd931cf9ccd2 \ + --hash=sha256:b28d2ca4add7ac16ae8bb6632a3c86e4b9e4d52d3e34267f6e1b0c1f8d87e389 \ + --hash=sha256:b99722ea48b7ea25e8e015e8341ae74624f72e5f21fc2abd45f3a93266de4c5d \ + --hash=sha256:baff393942b550823bfce952bb62270ee17504d02a1801d7fd0719534dfb9c84 \ + --hash=sha256:c0ee987efa6737242745f347835da2cc5bb9f1b42996a4d97d5c7ff7928cb6f2 \ + --hash=sha256:d0d2821003174de06b69e58cef2316a6622b60ee613121199cb2852a873f8cf3 \ + --hash=sha256:e0cf28db0f24a38b2a0ca33a85a54852586e43cf6fd876365c86e0657cfe7d73 \ + --hash=sha256:e4f5a7c49323533f9103d4dacf4e4f07078f360743dec7f7596949149efeec06 \ + 
--hash=sha256:eb58ca0abd96911932f688528977858681a59d61a7ce908ffd355957f7025cfc \ + --hash=sha256:edaf02b82cd7639db00dbff629995ef185c8df4c3ffa71a5562a595765a06ce1 \ + --hash=sha256:fef8c87f8abfb884dac04e97824b61299880c43f4ce675dd2cbeadd3c9b466d2 + # via + # jax + # jaxlib +six==1.16.0 \ + --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ + --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 + # via ml-collections +tensorstore==0.1.67 \ + --hash=sha256:186664b53d438f041b6aa706f0537147e4a23c2a4920f4483c77167967042081 \ + --hash=sha256:1b9950271f740b60286d6f88af740debb7f471036337ac864673415ef7dc46f0 \ + --hash=sha256:32cd94e9974e1683c1984041a1f12f8db0dc94a8cbc266e444451dca0f4228a4 \ + --hash=sha256:3476f2a3338d858dd34fcfdb8120df90203acc606fe41f8fdc70a8f3aee0e5e1 \ + --hash=sha256:3abfe92bf11721b43ed124c5f00c6c4b191b330c3ab0a6eb2cc8a4aa06760864 \ + --hash=sha256:53a9efd39ec0c9a8ccc11d4ffda719d210e95c4a4e3a9ccd6ea9a012e0794596 \ + --hash=sha256:56372833decf2e9fd6e57e0619e2eb167f22b7f9a5d4fa715b17959e4cdf2983 \ + --hash=sha256:686d330c8689306e390ed46aff85337f836e9e8ffcee019c89ce47e58bdae8cc \ + --hash=sha256:74eb34cea61081c6505204fe59e6183c67bf68535dd0f5a35eb6db04a951e9b9 \ + --hash=sha256:82ec1e66bf5f581f0192ff257c162db3ceccab3a0fb42378c06efeb555b46fe8 \ + --hash=sha256:83f7281d5212f080554a23bfebe09ec4d9ce07047a8146dbb4350d5664d955a9 \ + --hash=sha256:937da6006e1303960bcca8542168973735915207f97a93dc40288f1b26a3a7c1 \ + --hash=sha256:972fc74103d672aada6cb5acbd25094482f56c12d3d6a3d11fd49f209c3e451b \ + --hash=sha256:bbbcf520a167cd9466c03c6af8cd92aa8c82fab0b7858a188053a329c1f152b9 \ + --hash=sha256:cfcc4e86f06e22524f29869fdbf432531de71d8f757aa3b749331d2b5e00079c \ + --hash=sha256:d3a88a1c3db0fab891e652f1eefa82aa846ae686927cd8ff0c53f6f10d245f99 \ + --hash=sha256:dbc24747e114f11d168fc358cad051e1a2025e6ce8fb3d33b25db51755f8aff5 \ + --hash=sha256:dd6be769293479be523c2ac8a33cf9b5dbc8e5b37436bad740e3d7a782e91232 \ + --hash=sha256:e7421d27cb0ac28acaeb4a5f11a61d3901b48f06a5213b16fef5e11e1ef199fc \ + --hash=sha256:ee9a1000e8e7ebdf495272362fdb66957fba0753cc556a7e98f584cea08a6295 \ + --hash=sha256:fe25948659e8b3b93d12e7c609be6b8d71ba2b2aaba2fea451b7cf95cc340908 + # via + # flax + # orbax-checkpoint +toolz==1.0.0 \ + --hash=sha256:292c8f1c4e7516bf9086f8850935c799a874039c8bcf959d47b600e4c44a6236 \ + --hash=sha256:2c86e3d9a04798ac556793bced838816296a2f085017664e4995cb40a1047a02 + # via chex +typing-extensions==4.12.2 \ + --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d \ + --hash=sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8 + # via + # chex + # clu + # etils + # flax + # orbax-checkpoint +urllib3==2.2.3 \ + --hash=sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac \ + --hash=sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9 + # via requests +wrapt==1.16.0 \ + --hash=sha256:0d2691979e93d06a95a26257adb7bfd0c93818e89b1406f5a28f36e0d8c1e1fc \ + --hash=sha256:14d7dc606219cdd7405133c713f2c218d4252f2a469003f8c46bb92d5d095d81 \ + --hash=sha256:1a5db485fe2de4403f13fafdc231b0dbae5eca4359232d2efc79025527375b09 \ + --hash=sha256:1acd723ee2a8826f3d53910255643e33673e1d11db84ce5880675954183ec47e \ + --hash=sha256:1ca9b6085e4f866bd584fb135a041bfc32cab916e69f714a7d1d397f8c4891ca \ + --hash=sha256:1dd50a2696ff89f57bd8847647a1c363b687d3d796dc30d4dd4a9d1689a706f0 \ + --hash=sha256:2076fad65c6736184e77d7d4729b63a6d1ae0b70da4868adeec40989858eb3fb \ + 
--hash=sha256:2a88e6010048489cda82b1326889ec075a8c856c2e6a256072b28eaee3ccf487 \ + --hash=sha256:3ebf019be5c09d400cf7b024aa52b1f3aeebeff51550d007e92c3c1c4afc2a40 \ + --hash=sha256:418abb18146475c310d7a6dc71143d6f7adec5b004ac9ce08dc7a34e2babdc5c \ + --hash=sha256:43aa59eadec7890d9958748db829df269f0368521ba6dc68cc172d5d03ed8060 \ + --hash=sha256:44a2754372e32ab315734c6c73b24351d06e77ffff6ae27d2ecf14cf3d229202 \ + --hash=sha256:490b0ee15c1a55be9c1bd8609b8cecd60e325f0575fc98f50058eae366e01f41 \ + --hash=sha256:49aac49dc4782cb04f58986e81ea0b4768e4ff197b57324dcbd7699c5dfb40b9 \ + --hash=sha256:5eb404d89131ec9b4f748fa5cfb5346802e5ee8836f57d516576e61f304f3b7b \ + --hash=sha256:5f15814a33e42b04e3de432e573aa557f9f0f56458745c2074952f564c50e664 \ + --hash=sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d \ + --hash=sha256:66027d667efe95cc4fa945af59f92c5a02c6f5bb6012bff9e60542c74c75c362 \ + --hash=sha256:66dfbaa7cfa3eb707bbfcd46dab2bc6207b005cbc9caa2199bcbc81d95071a00 \ + --hash=sha256:685f568fa5e627e93f3b52fda002c7ed2fa1800b50ce51f6ed1d572d8ab3e7fc \ + --hash=sha256:6906c4100a8fcbf2fa735f6059214bb13b97f75b1a61777fcf6432121ef12ef1 \ + --hash=sha256:6a42cd0cfa8ffc1915aef79cb4284f6383d8a3e9dcca70c445dcfdd639d51267 \ + --hash=sha256:6dcfcffe73710be01d90cae08c3e548d90932d37b39ef83969ae135d36ef3956 \ + --hash=sha256:6f6eac2360f2d543cc875a0e5efd413b6cbd483cb3ad7ebf888884a6e0d2e966 \ + --hash=sha256:72554a23c78a8e7aa02abbd699d129eead8b147a23c56e08d08dfc29cfdddca1 \ + --hash=sha256:73870c364c11f03ed072dda68ff7aea6d2a3a5c3fe250d917a429c7432e15228 \ + --hash=sha256:73aa7d98215d39b8455f103de64391cb79dfcad601701a3aa0dddacf74911d72 \ + --hash=sha256:75ea7d0ee2a15733684badb16de6794894ed9c55aa5e9903260922f0482e687d \ + --hash=sha256:7bd2d7ff69a2cac767fbf7a2b206add2e9a210e57947dd7ce03e25d03d2de292 \ + --hash=sha256:807cc8543a477ab7422f1120a217054f958a66ef7314f76dd9e77d3f02cdccd0 \ + --hash=sha256:8e9723528b9f787dc59168369e42ae1c3b0d3fadb2f1a71de14531d321ee05b0 \ + --hash=sha256:9090c9e676d5236a6948330e83cb89969f433b1943a558968f659ead07cb3b36 \ + --hash=sha256:9153ed35fc5e4fa3b2fe97bddaa7cbec0ed22412b85bcdaf54aeba92ea37428c \ + --hash=sha256:9159485323798c8dc530a224bd3ffcf76659319ccc7bbd52e01e73bd0241a0c5 \ + --hash=sha256:941988b89b4fd6b41c3f0bfb20e92bd23746579736b7343283297c4c8cbae68f \ + --hash=sha256:94265b00870aa407bd0cbcfd536f17ecde43b94fb8d228560a1e9d3041462d73 \ + --hash=sha256:98b5e1f498a8ca1858a1cdbffb023bfd954da4e3fa2c0cb5853d40014557248b \ + --hash=sha256:9b201ae332c3637a42f02d1045e1d0cccfdc41f1f2f801dafbaa7e9b4797bfc2 \ + --hash=sha256:a0ea261ce52b5952bf669684a251a66df239ec6d441ccb59ec7afa882265d593 \ + --hash=sha256:a33a747400b94b6d6b8a165e4480264a64a78c8a4c734b62136062e9a248dd39 \ + --hash=sha256:a452f9ca3e3267cd4d0fcf2edd0d035b1934ac2bd7e0e57ac91ad6b95c0c6389 \ + --hash=sha256:a86373cf37cd7764f2201b76496aba58a52e76dedfaa698ef9e9688bfd9e41cf \ + --hash=sha256:ac83a914ebaf589b69f7d0a1277602ff494e21f4c2f743313414378f8f50a4cf \ + --hash=sha256:aefbc4cb0a54f91af643660a0a150ce2c090d3652cf4052a5397fb2de549cd89 \ + --hash=sha256:b3646eefa23daeba62643a58aac816945cadc0afaf21800a1421eeba5f6cfb9c \ + --hash=sha256:b47cfad9e9bbbed2339081f4e346c93ecd7ab504299403320bf85f7f85c7d46c \ + --hash=sha256:b935ae30c6e7400022b50f8d359c03ed233d45b725cfdd299462f41ee5ffba6f \ + --hash=sha256:bb2dee3874a500de01c93d5c71415fcaef1d858370d405824783e7a8ef5db440 \ + --hash=sha256:bc57efac2da352a51cc4658878a68d2b1b67dbe9d33c36cb826ca449d80a8465 \ + 
--hash=sha256:bf5703fdeb350e36885f2875d853ce13172ae281c56e509f4e6eca049bdfb136 \ + --hash=sha256:c31f72b1b6624c9d863fc095da460802f43a7c6868c5dda140f51da24fd47d7b \ + --hash=sha256:c5cd603b575ebceca7da5a3a251e69561bec509e0b46e4993e1cac402b7247b8 \ + --hash=sha256:d2efee35b4b0a347e0d99d28e884dfd82797852d62fcd7ebdeee26f3ceb72cf3 \ + --hash=sha256:d462f28826f4657968ae51d2181a074dfe03c200d6131690b7d65d55b0f360f8 \ + --hash=sha256:d5e49454f19ef621089e204f862388d29e6e8d8b162efce05208913dde5b9ad6 \ + --hash=sha256:da4813f751142436b075ed7aa012a8778aa43a99f7b36afe9b742d3ed8bdc95e \ + --hash=sha256:db2e408d983b0e61e238cf579c09ef7020560441906ca990fe8412153e3b291f \ + --hash=sha256:db98ad84a55eb09b3c32a96c576476777e87c520a34e2519d3e59c44710c002c \ + --hash=sha256:dbed418ba5c3dce92619656802cc5355cb679e58d0d89b50f116e4a9d5a9603e \ + --hash=sha256:dcdba5c86e368442528f7060039eda390cc4091bfd1dca41e8046af7c910dda8 \ + --hash=sha256:decbfa2f618fa8ed81c95ee18a387ff973143c656ef800c9f24fb7e9c16054e2 \ + --hash=sha256:e4fdb9275308292e880dcbeb12546df7f3e0f96c6b41197e0cf37d2826359020 \ + --hash=sha256:eb1b046be06b0fce7249f1d025cd359b4b80fc1c3e24ad9eca33e0dcdb2e4a35 \ + --hash=sha256:eb6e651000a19c96f452c85132811d25e9264d836951022d6e81df2fff38337d \ + --hash=sha256:ed867c42c268f876097248e05b6117a65bcd1e63b779e916fe2e33cd6fd0d3c3 \ + --hash=sha256:edfad1d29c73f9b863ebe7082ae9321374ccb10879eeabc84ba3b69f2579d537 \ + --hash=sha256:f2058f813d4f2b5e3a9eb2eb3faf8f1d99b81c3e51aeda4b168406443e8ba809 \ + --hash=sha256:f6b2d0c6703c988d334f297aa5df18c45e97b0af3679bb75059e0e0bd8b1069d \ + --hash=sha256:f8212564d49c50eb4565e502814f694e240c55551a5f1bc841d4fcaabb0a9b8a \ + --hash=sha256:ffa565331890b90056c01db69c0fe634a776f8019c143a5ae265f9c6bc4bd6d4 + # via clu +zipp==3.20.2 \ + --hash=sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350 \ + --hash=sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29 + # via etils diff --git a/build_deps/requirements_lock_3_12.txt b/build_deps/requirements_lock_3_12.txt new file mode 100644 index 00000000..86ac5e4c --- /dev/null +++ b/build_deps/requirements_lock_3_12.txt @@ -0,0 +1,739 @@ +# +# This file is autogenerated by pip-compile with Python 3.12 +# by the following command: +# +# bazel run //build:requirements.update +# +--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html +--find-links https://storage.googleapis.com/jax-releases/libtpu_releases.html + +absl-py==2.1.0 \ + --hash=sha256:526a04eadab8b4ee719ce68f204172ead1027549089702d99b9059f129ff1308 \ + --hash=sha256:7820790efbb316739cde8b4e19357243fc3608a152024288513dd968d7d959ff + # via + # -r build/requirements.in + # chex + # clu + # google-benchmark + # ml-collections + # optax + # orbax-checkpoint +certifi==2024.8.30 \ + --hash=sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8 \ + --hash=sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9 + # via requests +charset-normalizer==3.4.0 \ + --hash=sha256:0099d79bdfcf5c1f0c2c72f91516702ebf8b0b8ddd8905f97a8aecf49712c621 \ + --hash=sha256:0713f3adb9d03d49d365b70b84775d0a0d18e4ab08d12bc46baa6132ba78aaf6 \ + --hash=sha256:07afec21bbbbf8a5cc3651aa96b980afe2526e7f048fdfb7f1014d84acc8b6d8 \ + --hash=sha256:0b309d1747110feb25d7ed6b01afdec269c647d382c857ef4663bbe6ad95a912 \ + --hash=sha256:0d99dd8ff461990f12d6e42c7347fd9ab2532fb70e9621ba520f9e8637161d7c \ + --hash=sha256:0de7b687289d3c1b3e8660d0741874abe7888100efe14bd0f9fd7141bcbda92b \ + 
--hash=sha256:1110e22af8ca26b90bd6364fe4c763329b0ebf1ee213ba32b68c73de5752323d \ + --hash=sha256:130272c698667a982a5d0e626851ceff662565379baf0ff2cc58067b81d4f11d \ + --hash=sha256:136815f06a3ae311fae551c3df1f998a1ebd01ddd424aa5603a4336997629e95 \ + --hash=sha256:14215b71a762336254351b00ec720a8e85cada43b987da5a042e4ce3e82bd68e \ + --hash=sha256:1db4e7fefefd0f548d73e2e2e041f9df5c59e178b4c72fbac4cc6f535cfb1565 \ + --hash=sha256:1ffd9493de4c922f2a38c2bf62b831dcec90ac673ed1ca182fe11b4d8e9f2a64 \ + --hash=sha256:2006769bd1640bdf4d5641c69a3d63b71b81445473cac5ded39740a226fa88ab \ + --hash=sha256:20587d20f557fe189b7947d8e7ec5afa110ccf72a3128d61a2a387c3313f46be \ + --hash=sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e \ + --hash=sha256:27623ba66c183eca01bf9ff833875b459cad267aeeb044477fedac35e19ba907 \ + --hash=sha256:285e96d9d53422efc0d7a17c60e59f37fbf3dfa942073f666db4ac71e8d726d0 \ + --hash=sha256:2de62e8801ddfff069cd5c504ce3bc9672b23266597d4e4f50eda28846c322f2 \ + --hash=sha256:2f6c34da58ea9c1a9515621f4d9ac379871a8f21168ba1b5e09d74250de5ad62 \ + --hash=sha256:309a7de0a0ff3040acaebb35ec45d18db4b28232f21998851cfa709eeff49d62 \ + --hash=sha256:35c404d74c2926d0287fbd63ed5d27eb911eb9e4a3bb2c6d294f3cfd4a9e0c23 \ + --hash=sha256:3710a9751938947e6327ea9f3ea6332a09bf0ba0c09cae9cb1f250bd1f1549bc \ + --hash=sha256:3d59d125ffbd6d552765510e3f31ed75ebac2c7470c7274195b9161a32350284 \ + --hash=sha256:40d3ff7fc90b98c637bda91c89d51264a3dcf210cade3a2c6f838c7268d7a4ca \ + --hash=sha256:425c5f215d0eecee9a56cdb703203dda90423247421bf0d67125add85d0c4455 \ + --hash=sha256:43193c5cda5d612f247172016c4bb71251c784d7a4d9314677186a838ad34858 \ + --hash=sha256:44aeb140295a2f0659e113b31cfe92c9061622cadbc9e2a2f7b8ef6b1e29ef4b \ + --hash=sha256:47334db71978b23ebcf3c0f9f5ee98b8d65992b65c9c4f2d34c2eaf5bcaf0594 \ + --hash=sha256:4796efc4faf6b53a18e3d46343535caed491776a22af773f366534056c4e1fbc \ + --hash=sha256:4a51b48f42d9358460b78725283f04bddaf44a9358197b889657deba38f329db \ + --hash=sha256:4b67fdab07fdd3c10bb21edab3cbfe8cf5696f453afce75d815d9d7223fbe88b \ + --hash=sha256:4ec9dd88a5b71abfc74e9df5ebe7921c35cbb3b641181a531ca65cdb5e8e4dea \ + --hash=sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6 \ + --hash=sha256:54b6a92d009cbe2fb11054ba694bc9e284dad30a26757b1e372a1fdddaf21920 \ + --hash=sha256:55f56e2ebd4e3bc50442fbc0888c9d8c94e4e06a933804e2af3e89e2f9c1c749 \ + --hash=sha256:5726cf76c982532c1863fb64d8c6dd0e4c90b6ece9feb06c9f202417a31f7dd7 \ + --hash=sha256:5d447056e2ca60382d460a604b6302d8db69476fd2015c81e7c35417cfabe4cd \ + --hash=sha256:5ed2e36c3e9b4f21dd9422f6893dec0abf2cca553af509b10cd630f878d3eb99 \ + --hash=sha256:5ff2ed8194587faf56555927b3aa10e6fb69d931e33953943bc4f837dfee2242 \ + --hash=sha256:62f60aebecfc7f4b82e3f639a7d1433a20ec32824db2199a11ad4f5e146ef5ee \ + --hash=sha256:63bc5c4ae26e4bc6be6469943b8253c0fd4e4186c43ad46e713ea61a0ba49129 \ + --hash=sha256:6b40e8d38afe634559e398cc32b1472f376a4099c75fe6299ae607e404c033b2 \ + --hash=sha256:6b493a043635eb376e50eedf7818f2f322eabbaa974e948bd8bdd29eb7ef2a51 \ + --hash=sha256:6dba5d19c4dfab08e58d5b36304b3f92f3bd5d42c1a3fa37b5ba5cdf6dfcbcee \ + --hash=sha256:6fd30dc99682dc2c603c2b315bded2799019cea829f8bf57dc6b61efde6611c8 \ + --hash=sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b \ + --hash=sha256:7706f5850360ac01d80c89bcef1640683cc12ed87f42579dab6c5d3ed6888613 \ + --hash=sha256:7782afc9b6b42200f7362858f9e73b1f8316afb276d316336c0ec3bd73312742 \ + 
--hash=sha256:79983512b108e4a164b9c8d34de3992f76d48cadc9554c9e60b43f308988aabe \ + --hash=sha256:7f683ddc7eedd742e2889d2bfb96d69573fde1d92fcb811979cdb7165bb9c7d3 \ + --hash=sha256:82357d85de703176b5587dbe6ade8ff67f9f69a41c0733cf2425378b49954de5 \ + --hash=sha256:84450ba661fb96e9fd67629b93d2941c871ca86fc38d835d19d4225ff946a631 \ + --hash=sha256:86f4e8cca779080f66ff4f191a685ced73d2f72d50216f7112185dc02b90b9b7 \ + --hash=sha256:8cda06946eac330cbe6598f77bb54e690b4ca93f593dee1568ad22b04f347c15 \ + --hash=sha256:8ce7fd6767a1cc5a92a639b391891bf1c268b03ec7e021c7d6d902285259685c \ + --hash=sha256:8ff4e7cdfdb1ab5698e675ca622e72d58a6fa2a8aa58195de0c0061288e6e3ea \ + --hash=sha256:9289fd5dddcf57bab41d044f1756550f9e7cf0c8e373b8cdf0ce8773dc4bd417 \ + --hash=sha256:92a7e36b000bf022ef3dbb9c46bfe2d52c047d5e3f3343f43204263c5addc250 \ + --hash=sha256:92db3c28b5b2a273346bebb24857fda45601aef6ae1c011c0a997106581e8a88 \ + --hash=sha256:95c3c157765b031331dd4db3c775e58deaee050a3042fcad72cbc4189d7c8dca \ + --hash=sha256:980b4f289d1d90ca5efcf07958d3eb38ed9c0b7676bf2831a54d4f66f9c27dfa \ + --hash=sha256:9ae4ef0b3f6b41bad6366fb0ea4fc1d7ed051528e113a60fa2a65a9abb5b1d99 \ + --hash=sha256:9c98230f5042f4945f957d006edccc2af1e03ed5e37ce7c373f00a5a4daa6149 \ + --hash=sha256:9fa2566ca27d67c86569e8c85297aaf413ffab85a8960500f12ea34ff98e4c41 \ + --hash=sha256:a14969b8691f7998e74663b77b4c36c0337cb1df552da83d5c9004a93afdb574 \ + --hash=sha256:a8aacce6e2e1edcb6ac625fb0f8c3a9570ccc7bfba1f63419b3769ccf6a00ed0 \ + --hash=sha256:a8e538f46104c815be19c975572d74afb53f29650ea2025bbfaef359d2de2f7f \ + --hash=sha256:aa41e526a5d4a9dfcfbab0716c7e8a1b215abd3f3df5a45cf18a12721d31cb5d \ + --hash=sha256:aa693779a8b50cd97570e5a0f343538a8dbd3e496fa5dcb87e29406ad0299654 \ + --hash=sha256:ab22fbd9765e6954bc0bcff24c25ff71dcbfdb185fcdaca49e81bac68fe724d3 \ + --hash=sha256:ab2e5bef076f5a235c3774b4f4028a680432cded7cad37bba0fd90d64b187d19 \ + --hash=sha256:ab973df98fc99ab39080bfb0eb3a925181454d7c3ac8a1e695fddfae696d9e90 \ + --hash=sha256:af73657b7a68211996527dbfeffbb0864e043d270580c5aef06dc4b659a4b578 \ + --hash=sha256:b197e7094f232959f8f20541ead1d9862ac5ebea1d58e9849c1bf979255dfac9 \ + --hash=sha256:b295729485b06c1a0683af02a9e42d2caa9db04a373dc38a6a58cdd1e8abddf1 \ + --hash=sha256:b8831399554b92b72af5932cdbbd4ddc55c55f631bb13ff8fe4e6536a06c5c51 \ + --hash=sha256:b8dcd239c743aa2f9c22ce674a145e0a25cb1566c495928440a181ca1ccf6719 \ + --hash=sha256:bcb4f8ea87d03bc51ad04add8ceaf9b0f085ac045ab4d74e73bbc2dc033f0236 \ + --hash=sha256:bd7af3717683bea4c87acd8c0d3d5b44d56120b26fd3f8a692bdd2d5260c620a \ + --hash=sha256:bf4475b82be41b07cc5e5ff94810e6a01f276e37c2d55571e3fe175e467a1a1c \ + --hash=sha256:c3e446d253bd88f6377260d07c895816ebf33ffffd56c1c792b13bff9c3e1ade \ + --hash=sha256:c57516e58fd17d03ebe67e181a4e4e2ccab1168f8c2976c6a334d4f819fe5944 \ + --hash=sha256:c94057af19bc953643a33581844649a7fdab902624d2eb739738a30e2b3e60fc \ + --hash=sha256:cab5d0b79d987c67f3b9e9c53f54a61360422a5a0bc075f43cab5621d530c3b6 \ + --hash=sha256:ce031db0408e487fd2775d745ce30a7cd2923667cf3b69d48d219f1d8f5ddeb6 \ + --hash=sha256:cee4373f4d3ad28f1ab6290684d8e2ebdb9e7a1b74fdc39e4c211995f77bec27 \ + --hash=sha256:d5b054862739d276e09928de37c79ddeec42a6e1bfc55863be96a36ba22926f6 \ + --hash=sha256:dbe03226baf438ac4fda9e2d0715022fd579cb641c4cf639fa40d53b2fe6f3e2 \ + --hash=sha256:dc15e99b2d8a656f8e666854404f1ba54765871104e50c8e9813af8a7db07f12 \ + --hash=sha256:dcaf7c1524c0542ee2fc82cc8ec337f7a9f7edee2532421ab200d2b920fc97cf \ + 
--hash=sha256:dd4eda173a9fcccb5f2e2bd2a9f423d180194b1bf17cf59e3269899235b2a114 \ + --hash=sha256:dd9a8bd8900e65504a305bf8ae6fa9fbc66de94178c420791d0293702fce2df7 \ + --hash=sha256:de7376c29d95d6719048c194a9cf1a1b0393fbe8488a22008610b0361d834ecf \ + --hash=sha256:e7fdd52961feb4c96507aa649550ec2a0d527c086d284749b2f582f2d40a2e0d \ + --hash=sha256:e91f541a85298cf35433bf66f3fab2a4a2cff05c127eeca4af174f6d497f0d4b \ + --hash=sha256:e9e3c4c9e1ed40ea53acf11e2a386383c3304212c965773704e4603d589343ed \ + --hash=sha256:ee803480535c44e7f5ad00788526da7d85525cfefaf8acf8ab9a310000be4b03 \ + --hash=sha256:f09cb5a7bbe1ecae6e87901a2eb23e0256bb524a79ccc53eb0b7629fbe7677c4 \ + --hash=sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67 \ + --hash=sha256:f1a2f519ae173b5b6a2c9d5fa3116ce16e48b3462c8b96dfdded11055e3d6365 \ + --hash=sha256:f28f891ccd15c514a0981f3b9db9aa23d62fe1a99997512b0491d2ed323d229a \ + --hash=sha256:f3e73a4255342d4eb26ef6df01e3962e73aa29baa3124a8e824c5d3364a65748 \ + --hash=sha256:f606a1881d2663630ea5b8ce2efe2111740df4b687bd78b34a8131baa007f79b \ + --hash=sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079 \ + --hash=sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482 + # via requests +chex==0.1.87 \ + --hash=sha256:0096d89cc8d898bb521ef4bfbf5c24549022b0e5b301f529ab57238896fe6c5d \ + --hash=sha256:ce536475661fd96d21be0c1728ecdbedd03f8ff950c662dfc338c92ea782cb16 + # via optax +clu==0.0.12 \ + --hash=sha256:0d183e7d25f7dd0700444510a264e24700e2f068bdabd199ed22866f7e54edba \ + --hash=sha256:f71eaa1afbd30f57f7709257ba7e1feb8ad5c1c3dcae3606672a138678bb3ce4 + # via -r build/requirements.in +contextlib2==21.6.0 \ + --hash=sha256:3fbdb64466afd23abaf6c977627b75b6139a5a3e8ce38405c5b413aed7a0471f \ + --hash=sha256:ab1e2bfe1d01d968e1b7e8d9023bc51ef3509bba217bb730cee3827e1ee82869 + # via ml-collections +dm-tree==0.1.8 \ + --hash=sha256:054b461f8176f4bce7a21f7b1870f873a1ced3bdbe1282c816c550bb43c71fa6 \ + --hash=sha256:09964470f76a5201aff2e8f9b26842976de7889300676f927930f6285e256760 \ + --hash=sha256:0d3172394079a86c3a759179c65f64c48d1a42b89495fcf38976d11cc3bb952c \ + --hash=sha256:0e9620ccf06393eb6b613b5e366469304622d4ea96ae6540b28a33840e6c89cf \ + --hash=sha256:0fcaabbb14e7980377439e7140bd05552739ca5e515ecb3119f234acee4b9430 \ + --hash=sha256:1607ce49aa42f010d1e5e616d92ce899d66835d4d8bea49679582435285515de \ + --hash=sha256:181c35521d480d0365f39300542cb6cd7fd2b77351bb43d7acfda15aef63b317 \ + --hash=sha256:1d7c26e431fc93cc7e0cba867eb000db6a05f6f2b25af11ac4e9dada88fc5bca \ + --hash=sha256:1fe962015b2fe1282892b28ebe962faed53c7f98d942da9a4625cbf27baef913 \ + --hash=sha256:250b692fb75f45f02e2f58fbef9ab338904ef334b90557565621fa251df267cf \ + --hash=sha256:2869228d9c619074de501a3c10dc7f07c75422f8fab36ecdcb859b6f1b1ec3ef \ + --hash=sha256:28c52cbf4f8b3dbd0beaedf44f69fa85eec5e9dede612e08035e06ada6ec9426 \ + --hash=sha256:2f7915660f59c09068e428613c480150180df1060561fd0d1470684ae7007bd1 \ + --hash=sha256:343a4a4ebaa127451ff971254a4be4084eb4bdc0b2513c32b46f6f728fd03f9e \ + --hash=sha256:35cc164a79336bfcfafb47e5f297898359123bbd3330c1967f0c4994f9cf9f60 \ + --hash=sha256:378cc8ad93c5fe3590f405a309980721f021c790ca1bdf9b15bb1d59daec57f5 \ + --hash=sha256:39070ba268c0491af9fe7a58644d99e8b4f2cde6e5884ba3380bddc84ed43d5f \ + --hash=sha256:435227cf3c5dc63f4de054cf3d00183790bd9ead4c3623138c74dde7f67f521b \ + --hash=sha256:5483dca4d7eb1a0d65fe86d3b6a53ae717face83c1f17e0887b1a4a64ae5c410 \ + --hash=sha256:694c3654cfd2a81552c08ec66bb5c4a3d48fa292b9a181880fb081c36c5b9134 \ + 
--hash=sha256:75c5d528bb992981c20793b6b453e91560784215dffb8a5440ba999753c14ceb \ + --hash=sha256:803bfc53b4659f447ac694dbd04235f94a73ef7c1fd1e0df7c84ac41e0bc963b \ + --hash=sha256:81fce77f22a302d7a5968aebdf4efafef4def7ce96528719a354e6990dcd49c7 \ + --hash=sha256:83b7764de0d855338abefc6e3ee9fe40d301668310aa3baea3f778ff051f4393 \ + --hash=sha256:8c60a7eadab64c2278861f56bca320b2720f163dca9d7558103c3b77f2416571 \ + --hash=sha256:8ed3564abed97c806db122c2d3e1a2b64c74a63debe9903aad795167cc301368 \ + --hash=sha256:94d3f0826311f45ee19b75f5b48c99466e4218a0489e81c0f0167bda50cacf22 \ + --hash=sha256:96a548a406a6fb15fe58f6a30a57ff2f2aafbf25f05afab00c8f5e5977b6c715 \ + --hash=sha256:a5d819c38c03f0bb5b3b3703c60e4b170355a0fc6b5819325bf3d4ceb3ae7e80 \ + --hash=sha256:ad16ceba90a56ec47cf45b21856d14962ac314787975ef786efb5e6e9ca75ec7 \ + --hash=sha256:af4b3d372f2477dcd89a6e717e4a575ca35ccc20cc4454a8a4b6f8838a00672d \ + --hash=sha256:b095ba4f8ca1ba19350fd53cf1f8f3eb0bd406aa28af64a6dfc86707b32a810a \ + --hash=sha256:b9bd9b9ccb59409d33d51d84b7668010c04c2af7d4a371632874c1ca356cff3d \ + --hash=sha256:b9f89a454e98806b44fe9d40ec9eee61f848388f7e79ac2371a55679bd5a3ac6 \ + --hash=sha256:bb2d109f42190225112da899b9f3d46d0d5f26aef501c61e43529fe9322530b5 \ + --hash=sha256:c0a94aba18a35457a1b5cd716fd7b46c5dafdc4cf7869b4bae665b91c4682a8e \ + --hash=sha256:c5c8c12e3fda754ef6af94161bacdaeda816d941995fac415d6855c6c386af68 \ + --hash=sha256:d1612fcaecd79023dbc6a6ae48d51a80beb5c385d6f3f6d71688e57bc8d07de8 \ + --hash=sha256:d16e1f2a073604cfcc09f7131ae8d534674f43c3aef4c25742eae295bc60d04f \ + --hash=sha256:d20f2faa3672b52e5013f4077117bfb99c4cfc0b445d3bde1584c34032b57436 \ + --hash=sha256:d40fa4106ca6edc66760246a08f500ec0c85ef55c762fb4a363f6ee739ba02ee \ + --hash=sha256:de287fabc464b8734be251e46e06aa9aa1001f34198da2b6ce07bd197172b9cb \ + --hash=sha256:e4d714371bb08839e4e5e29024fc95832d9affe129825ef38836b143028bd144 \ + --hash=sha256:ea9e59e0451e7d29aece402d9f908f2e2a80922bcde2ebfd5dcb07750fcbfee8 \ + --hash=sha256:f7ac31b9aecccb2c6e1ab29706f6ded3eba0c2c69c770322c9c685929c3d6afb \ + --hash=sha256:fa42a605d099ee7d41ba2b5fb75e21423951fd26e5d50583a00471238fb3021d + # via -r build/requirements.in +einops==0.8.0 \ + --hash=sha256:63486517fed345712a8385c100cb279108d9d47e6ae59099b07657e983deae85 \ + --hash=sha256:9572fb63046264a862693b0a87088af3bdc8c068fde03de63453cbbde245465f + # via -r build/requirements.in +etils[epath,epy]==1.10.0 \ + --hash=sha256:0777fe60a234b4c65ca53470fc64f2dd2d0c6bca7fcc623fdaa8d7fa5a317098 \ + --hash=sha256:4eaa9d7248fd4eeb75e44d47ca29875a5ccea044cc14a17435794bf8ac116a05 + # via + # clu + # optax + # orbax-checkpoint +flax==0.10.1 \ + --hash=sha256:5218959706bc659a1f282ca537446163093d186d8edb9b1405c0efee4d90d22a \ + --hash=sha256:ea98ed843c37954af2e262ea47356312a046794d7a5490d31682dffe908e25d3 + # via + # -r build/requirements.in + # clu +fsspec==2024.10.0 \ + --hash=sha256:03b9a6785766a4de40368b88906366755e2819e758b83705c88cd7cb5fe81871 \ + --hash=sha256:eda2d8a4116d4f2429db8550f2457da57279247dd930bb12f821b58391359493 + # via etils +google-benchmark==1.8.3 \ + --hash=sha256:063f6df1ed384e4dc881ac96644153c18ed755f1a2ed32272534a110bdf14871 \ + --hash=sha256:066b69f809fd0ebc697c90075d1194e4c4ada117811731431523f821b421b28f \ + --hash=sha256:2b3bb7905233dec505de5cff35e0725b190f411d16ae97e9050073bf9c79cf2a \ + --hash=sha256:5c4786323817112303edf7fd70dc60d1aa15c175d1c9e2c63d71292bb3e51828 \ + --hash=sha256:71152a826b162146473a06015eefa9f066e19b316a06826fbf25386615653a64 \ + 
--hash=sha256:731f1881b757df18add80566ae796b6da101935ea1f45932d1ee094d5fb85b46 \ + --hash=sha256:902d6e6da560a716ba709c6b55f8585f1aa64a76711b9a1f068e064567f58a4a \ + --hash=sha256:93e9ef9abf9f9e845a2141935bbcee5e42a7bedc3efb14072adc0310a8b49072 \ + --hash=sha256:9e1d39431e2a5d0960676c3f62180f48c0cb2802c42895eaf5541b7029c20301 \ + --hash=sha256:9f3432a57177f7a46608a07551d50edfe608da344aca07d476a888fb36438650 \ + --hash=sha256:aa3354bb71dc3a32672d1c7fd0621f4967c519213c018dd8e20a9d9e6fb2ae7b \ + --hash=sha256:ab8212aaadc39b5aaa0afc063b64959ca93271cf6a72852f0d0aad26f9ae9f24 \ + --hash=sha256:ba0547b1075a290e3432025bb544b02f7c717c30e31f696f82907571cb5e2be9 \ + --hash=sha256:c476005b9e7f32c45000719b7c8c2fa95ddcfc058af8d08052eb73692d143619 \ + --hash=sha256:d1504fd53e936d70f438e474c2e87fd94f81bd74a5ae855b1e40d1f9994cdbeb \ + --hash=sha256:d2ea4544d3e17a6f87432bc97e79fea23490d9c7c4d10ebd213acf6a40bd1b61 \ + --hash=sha256:d5d31bbbec9ebe9a1bab34a631a35988c424ef55ea14055238bc77f7d8f19836 \ + --hash=sha256:e69bd848173557ed3762830725bff00c2a92de974189a54bd77485bb8bcb18f4 \ + --hash=sha256:f23a591951c59100e30d97b7ba222072f544d318f470420e21872dee40a4aff0 \ + --hash=sha256:fb014cb611e929d2c2696b009f51ac657c24f706881f3123f10c810b11ba378b \ + --hash=sha256:fc4faa364f22ef81b7d3e9f4ecc6ad62f28d68c47008002aa64474b941b1c76c + # via -r build/requirements.in +humanize==4.11.0 \ + --hash=sha256:b53caaec8532bcb2fff70c8826f904c35943f8cecaca29d272d9df38092736c0 \ + --hash=sha256:e66f36020a2d5a974c504bd2555cf770621dbdbb6d82f94a6857c0b1ea2608be + # via orbax-checkpoint +idna==3.10 \ + --hash=sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9 \ + --hash=sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3 + # via requests +importlib-resources==6.4.5 \ + --hash=sha256:980862a1d16c9e147a59603677fa2aa5fd82b87f223b6cb870695bcfce830065 \ + --hash=sha256:ac29d5f956f01d5e4bb63102a5a19957f1b9175e45649977264a1416783bb717 + # via etils +jax[tpu]==0.4.35 \ + --hash=sha256:c0c986993026b10bf6f607fecb7417377460254640766ce40f1fef3fd139c12e \ + --hash=sha256:fa99e909a31424abfec750019a6dd36f6acc18a6e7d40e2c0086b932cc351325 + # via + # -r build/requirements.in + # chex + # clu + # flax + # optax + # orbax-checkpoint +jaxlib==0.4.35 \ + --hash=sha256:04d1db3bf0050d120238bfb9b686b58fefcc4d9dd9e2d96aecd3f68a1f1f5e0a \ + --hash=sha256:0be3cf9df879d9ae1b5b92fc281f77d21f522fcbae1a48a02661026bbd9b9309 \ + --hash=sha256:0fd990354d5623d3a34493fcd7213493390dbf5039bea19b62e2aaee1049eda9 \ + --hash=sha256:14aeac3fea2ca1d5afb1878f72470b159cc89adb2633c5f0686f5d7c39f2ac18 \ + --hash=sha256:187cb6929dc139b75d952d67c33118473c1b4105525a3e5607f064e7b8efdc74 \ + --hash=sha256:261570c94b169dc90f3af903282eeec856b52736c0944d243504ced93d19b217 \ + --hash=sha256:330c090bb9af413f552d8a92d097e50baec6b75823430fb2966a49f5298d4c43 \ + --hash=sha256:504d0a2e2117724359d99d7e3663022686dcdddd85aa14bdad02008d444481ad \ + --hash=sha256:5d2d8a5b89d334b875ede98d7fcee946bebef1a1b5abd118ff543bcef4ab09f5 \ + --hash=sha256:7b11ad7c13f7f96f36efd303711ecac425f19ca2ddf65cf1be1541167a959ee5 \ + --hash=sha256:7f8bfc90f68857b223b7e38a9bdf466a4f1cb405c9a4aa11698dc9ab7b35c29b \ + --hash=sha256:8f8c499644660aefd0ae2ee31039da6d4df0f26d0ee67ba9fb316183a5304288 \ + --hash=sha256:907e548ad6ce53b242a55c5f36c2a2a4c37d38f6cd8c356fc550a2f18ab0e82f \ + --hash=sha256:91a283a72263feebe0d110d1136df96950744e47530f12df42c03f36888c971e \ + --hash=sha256:b44f3e6e9fb748bb43df914356cf9d0d0c9a6e446a12c21fe843db25ed0df65f \ + 
--hash=sha256:bc9eafba001ff8569cfa252fe7f04ba553622702b4b473b656dd0866edf6b8d4 \ + --hash=sha256:d210bab7e1ce0b2f2e568548b3903ea6aec349019fc1398cd2a0c069e8342e62 \ + --hash=sha256:dddffce48d7e6057008999aed2d8a9daecc57a48c45a4f8c475e00880eb2e41d \ + --hash=sha256:e1cee6dc291251f3fb6b0127fdd96c0439ac1ea97e01571d06910df72d6ac6e1 \ + --hash=sha256:e8c9579e20d5ecdc4f61336cdd032710cb8c38d5ae9c4fce0cf9ea031cef21cb + # via + # chex + # clu + # jax + # optax +libtpu==0.0.2 \ + --hash=sha256:9e1f7899ece1f4bb8c0832f5570246b46f1ca57837e5b62e1409ee48cf06403f + # via jax +libtpu-nightly==0.1.dev20241010+nightly.cleanup \ + --hash=sha256:935fe93a8d34e4566c168e9bc8c690d4729d5cf4e051625e86f4e4fa9a261232 + # via jax +markdown-it-py==3.0.0 \ + --hash=sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1 \ + --hash=sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb + # via rich +mdurl==0.1.2 \ + --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ + --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba + # via markdown-it-py +ml-collections==0.1.1 \ + --hash=sha256:3fefcc72ec433aa1e5d32307a3e474bbb67f405be814ea52a2166bfc9dbe68cc + # via clu +ml-dtypes==0.5.0 \ + --hash=sha256:099e09edd54e676903b4538f3815b5ab96f5b119690514602d96bfdb67172cbe \ + --hash=sha256:2e7534392682c3098bc7341648c650864207169c654aed83143d7a19c67ae06f \ + --hash=sha256:3e7d3a380fe73a63c884f06136f8baa7a5249cc8e9fdec677997dd78549f8128 \ + --hash=sha256:54415257f00eb44fbcc807454efac3356f75644f1cbfc2d4e5522a72ae1dacab \ + --hash=sha256:5f2b59233a0dbb6a560b3137ed6125433289ccba2f8d9c3695a52423a369ed15 \ + --hash=sha256:60275f2b51b56834e840c4809fca840565f9bf8e9a73f6d8c94f5b5935701215 \ + --hash=sha256:76942f6aeb5c40766d5ea62386daa4148e6a54322aaf5b53eae9e7553240222f \ + --hash=sha256:7ee9c320bb0f9ffdf9f6fa6a696ef2e005d1f66438d6f1c1457338e00a02e8cf \ + --hash=sha256:8c32138975797e681eb175996d64356bcfa124bdbb6a70460b9768c2b35a6fa4 \ + --hash=sha256:968fede07d1f9b926a63df97d25ac656cac1a57ebd33701734eaf704bc55d8d8 \ + --hash=sha256:a03fc861b86cc586728e3d093ba37f0cc05e65330c3ebd7688e7bae8290f8859 \ + --hash=sha256:a38df8df61194aeaae1ab7579075779b4ad32cd1cffd012c28be227fa7f2a70a \ + --hash=sha256:a988bac6572630e1e9c2edd9b1277b4eefd1c86209e52b0d061b775ac33902ff \ + --hash=sha256:ab046f2ff789b1f11b2491909682c5d089934835f9a760fafc180e47dcb676b8 \ + --hash=sha256:afa08343069874a30812871d639f9c02b4158ace065601406a493a8511180c02 \ + --hash=sha256:c7a9152f5876fef565516aa5dd1dccd6fc298a5891b2467973905103eb5c7856 \ + --hash=sha256:cb5cc7b25acabd384f75bbd78892d0c724943f3e2e1986254665a1aa10982e07 \ + --hash=sha256:d3b3db9990c3840986a0e70524e122cfa32b91139c3653df76121ba7776e015f \ + --hash=sha256:d4b1a70a3e5219790d6b55b9507606fc4e02911d1497d16c18dd721eb7efe7d0 \ + --hash=sha256:dc74fd9995513d33eac63d64e436240f5494ec74d522a9f0920194942fc3d2d7 \ + --hash=sha256:e04fde367b2fe901b1d47234426fe8819909bd1dd862a5adb630f27789c20599 + # via + # jax + # jaxlib + # tensorstore +msgpack==1.1.0 \ + --hash=sha256:06f5fd2f6bb2a7914922d935d3b8bb4a7fff3a9a91cfce6d06c13bc42bec975b \ + --hash=sha256:071603e2f0771c45ad9bc65719291c568d4edf120b44eb36324dcb02a13bfddf \ + --hash=sha256:0907e1a7119b337971a689153665764adc34e89175f9a34793307d9def08e6ca \ + --hash=sha256:0f92a83b84e7c0749e3f12821949d79485971f087604178026085f60ce109330 \ + --hash=sha256:115a7af8ee9e8cddc10f87636767857e7e3717b7a2e97379dc2054712693e90f \ + --hash=sha256:13599f8829cfbe0158f6456374e9eea9f44eee08076291771d8ae93eda56607f \ + 
--hash=sha256:17fb65dd0bec285907f68b15734a993ad3fc94332b5bb21b0435846228de1f39 \ + --hash=sha256:2137773500afa5494a61b1208619e3871f75f27b03bcfca7b3a7023284140247 \ + --hash=sha256:3180065ec2abbe13a4ad37688b61b99d7f9e012a535b930e0e683ad6bc30155b \ + --hash=sha256:398b713459fea610861c8a7b62a6fec1882759f308ae0795b5413ff6a160cf3c \ + --hash=sha256:3d364a55082fb2a7416f6c63ae383fbd903adb5a6cf78c5b96cc6316dc1cedc7 \ + --hash=sha256:3df7e6b05571b3814361e8464f9304c42d2196808e0119f55d0d3e62cd5ea044 \ + --hash=sha256:41c991beebf175faf352fb940bf2af9ad1fb77fd25f38d9142053914947cdbf6 \ + --hash=sha256:42f754515e0f683f9c79210a5d1cad631ec3d06cea5172214d2176a42e67e19b \ + --hash=sha256:452aff037287acb1d70a804ffd022b21fa2bb7c46bee884dbc864cc9024128a0 \ + --hash=sha256:4676e5be1b472909b2ee6356ff425ebedf5142427842aa06b4dfd5117d1ca8a2 \ + --hash=sha256:46c34e99110762a76e3911fc923222472c9d681f1094096ac4102c18319e6468 \ + --hash=sha256:471e27a5787a2e3f974ba023f9e265a8c7cfd373632247deb225617e3100a3c7 \ + --hash=sha256:4a1964df7b81285d00a84da4e70cb1383f2e665e0f1f2a7027e683956d04b734 \ + --hash=sha256:4b51405e36e075193bc051315dbf29168d6141ae2500ba8cd80a522964e31434 \ + --hash=sha256:4d1b7ff2d6146e16e8bd665ac726a89c74163ef8cd39fa8c1087d4e52d3a2325 \ + --hash=sha256:53258eeb7a80fc46f62fd59c876957a2d0e15e6449a9e71842b6d24419d88ca1 \ + --hash=sha256:534480ee5690ab3cbed89d4c8971a5c631b69a8c0883ecfea96c19118510c846 \ + --hash=sha256:58638690ebd0a06427c5fe1a227bb6b8b9fdc2bd07701bec13c2335c82131a88 \ + --hash=sha256:58dfc47f8b102da61e8949708b3eafc3504509a5728f8b4ddef84bd9e16ad420 \ + --hash=sha256:59caf6a4ed0d164055ccff8fe31eddc0ebc07cf7326a2aaa0dbf7a4001cd823e \ + --hash=sha256:5dbad74103df937e1325cc4bfeaf57713be0b4f15e1c2da43ccdd836393e2ea2 \ + --hash=sha256:5e1da8f11a3dd397f0a32c76165cf0c4eb95b31013a94f6ecc0b280c05c91b59 \ + --hash=sha256:646afc8102935a388ffc3914b336d22d1c2d6209c773f3eb5dd4d6d3b6f8c1cb \ + --hash=sha256:64fc9068d701233effd61b19efb1485587560b66fe57b3e50d29c5d78e7fef68 \ + --hash=sha256:65553c9b6da8166e819a6aa90ad15288599b340f91d18f60b2061f402b9a4915 \ + --hash=sha256:685ec345eefc757a7c8af44a3032734a739f8c45d1b0ac45efc5d8977aa4720f \ + --hash=sha256:6ad622bf7756d5a497d5b6836e7fc3752e2dd6f4c648e24b1803f6048596f701 \ + --hash=sha256:73322a6cc57fcee3c0c57c4463d828e9428275fb85a27aa2aa1a92fdc42afd7b \ + --hash=sha256:74bed8f63f8f14d75eec75cf3d04ad581da6b914001b474a5d3cd3372c8cc27d \ + --hash=sha256:79ec007767b9b56860e0372085f8504db5d06bd6a327a335449508bbee9648fa \ + --hash=sha256:7a946a8992941fea80ed4beae6bff74ffd7ee129a90b4dd5cf9c476a30e9708d \ + --hash=sha256:7ad442d527a7e358a469faf43fda45aaf4ac3249c8310a82f0ccff9164e5dccd \ + --hash=sha256:7c9a35ce2c2573bada929e0b7b3576de647b0defbd25f5139dcdaba0ae35a4cc \ + --hash=sha256:7e7b853bbc44fb03fbdba34feb4bd414322180135e2cb5164f20ce1c9795ee48 \ + --hash=sha256:879a7b7b0ad82481c52d3c7eb99bf6f0645dbdec5134a4bddbd16f3506947feb \ + --hash=sha256:8a706d1e74dd3dea05cb54580d9bd8b2880e9264856ce5068027eed09680aa74 \ + --hash=sha256:8a84efb768fb968381e525eeeb3d92857e4985aacc39f3c47ffd00eb4509315b \ + --hash=sha256:8cf9e8c3a2153934a23ac160cc4cba0ec035f6867c8013cc6077a79823370346 \ + --hash=sha256:8da4bf6d54ceed70e8861f833f83ce0814a2b72102e890cbdfe4b34764cdd66e \ + --hash=sha256:8e59bca908d9ca0de3dc8684f21ebf9a690fe47b6be93236eb40b99af28b6ea6 \ + --hash=sha256:914571a2a5b4e7606997e169f64ce53a8b1e06f2cf2c3a7273aa106236d43dd5 \ + --hash=sha256:a51abd48c6d8ac89e0cfd4fe177c61481aca2d5e7ba42044fd218cfd8ea9899f \ + 
--hash=sha256:a52a1f3a5af7ba1c9ace055b659189f6c669cf3657095b50f9602af3a3ba0fe5 \ + --hash=sha256:ad33e8400e4ec17ba782f7b9cf868977d867ed784a1f5f2ab46e7ba53b6e1e1b \ + --hash=sha256:b4c01941fd2ff87c2a934ee6055bda4ed353a7846b8d4f341c428109e9fcde8c \ + --hash=sha256:bce7d9e614a04d0883af0b3d4d501171fbfca038f12c77fa838d9f198147a23f \ + --hash=sha256:c40ffa9a15d74e05ba1fe2681ea33b9caffd886675412612d93ab17b58ea2fec \ + --hash=sha256:c5a91481a3cc573ac8c0d9aace09345d989dc4a0202b7fcb312c88c26d4e71a8 \ + --hash=sha256:c921af52214dcbb75e6bdf6a661b23c3e6417f00c603dd2070bccb5c3ef499f5 \ + --hash=sha256:d46cf9e3705ea9485687aa4001a76e44748b609d260af21c4ceea7f2212a501d \ + --hash=sha256:d8ce0b22b890be5d252de90d0e0d119f363012027cf256185fc3d474c44b1b9e \ + --hash=sha256:dd432ccc2c72b914e4cb77afce64aab761c1137cc698be3984eee260bcb2896e \ + --hash=sha256:e0856a2b7e8dcb874be44fea031d22e5b3a19121be92a1e098f46068a11b0870 \ + --hash=sha256:e1f3c3d21f7cf67bcf2da8e494d30a75e4cf60041d98b3f79875afb5b96f3a3f \ + --hash=sha256:f1ba6136e650898082d9d5a5217d5906d1e138024f836ff48691784bbe1adf96 \ + --hash=sha256:f3e9b4936df53b970513eac1758f3882c88658a220b58dcc1e39606dccaaf01c \ + --hash=sha256:f80bc7d47f76089633763f952e67f8214cb7b3ee6bfa489b3cb6a84cfac114cd \ + --hash=sha256:fd2906780f25c8ed5d7b323379f6138524ba793428db5d0e9d226d3fa6aa1788 + # via + # flax + # orbax-checkpoint +nest-asyncio==1.6.0 \ + --hash=sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe \ + --hash=sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c + # via orbax-checkpoint +numpy==2.1.3 \ + --hash=sha256:016d0f6f5e77b0f0d45d77387ffa4bb89816b57c835580c3ce8e099ef830befe \ + --hash=sha256:02135ade8b8a84011cbb67dc44e07c58f28575cf9ecf8ab304e51c05528c19f0 \ + --hash=sha256:08788d27a5fd867a663f6fc753fd7c3ad7e92747efc73c53bca2f19f8bc06f48 \ + --hash=sha256:0d30c543f02e84e92c4b1f415b7c6b5326cbe45ee7882b6b77db7195fb971e3a \ + --hash=sha256:0fa14563cc46422e99daef53d725d0c326e99e468a9320a240affffe87852564 \ + --hash=sha256:13138eadd4f4da03074851a698ffa7e405f41a0845a6b1ad135b81596e4e9958 \ + --hash=sha256:14e253bd43fc6b37af4921b10f6add6925878a42a0c5fe83daee390bca80bc17 \ + --hash=sha256:15cb89f39fa6d0bdfb600ea24b250e5f1a3df23f901f51c8debaa6a5d122b2f0 \ + --hash=sha256:17ee83a1f4fef3c94d16dc1802b998668b5419362c8a4f4e8a491de1b41cc3ee \ + --hash=sha256:2312b2aa89e1f43ecea6da6ea9a810d06aae08321609d8dc0d0eda6d946a541b \ + --hash=sha256:2564fbdf2b99b3f815f2107c1bbc93e2de8ee655a69c261363a1172a79a257d4 \ + --hash=sha256:3522b0dfe983a575e6a9ab3a4a4dfe156c3e428468ff08ce582b9bb6bd1d71d4 \ + --hash=sha256:4394bc0dbd074b7f9b52024832d16e019decebf86caf909d94f6b3f77a8ee3b6 \ + --hash=sha256:45966d859916ad02b779706bb43b954281db43e185015df6eb3323120188f9e4 \ + --hash=sha256:4d1167c53b93f1f5d8a139a742b3c6f4d429b54e74e6b57d0eff40045187b15d \ + --hash=sha256:4f2015dfe437dfebbfce7c85c7b53d81ba49e71ba7eadbf1df40c915af75979f \ + --hash=sha256:50ca6aba6e163363f132b5c101ba078b8cbd3fa92c7865fd7d4d62d9779ac29f \ + --hash=sha256:50d18c4358a0a8a53f12a8ba9d772ab2d460321e6a93d6064fc22443d189853f \ + --hash=sha256:5641516794ca9e5f8a4d17bb45446998c6554704d888f86df9b200e66bdcce56 \ + --hash=sha256:576a1c1d25e9e02ed7fa5477f30a127fe56debd53b8d2c89d5578f9857d03ca9 \ + --hash=sha256:6a4825252fcc430a182ac4dee5a505053d262c807f8a924603d411f6718b88fd \ + --hash=sha256:72dcc4a35a8515d83e76b58fdf8113a5c969ccd505c8a946759b24e3182d1f23 \ + --hash=sha256:747641635d3d44bcb380d950679462fae44f54b131be347d5ec2bce47d3df9ed \ + 
--hash=sha256:762479be47a4863e261a840e8e01608d124ee1361e48b96916f38b119cfda04a \ + --hash=sha256:78574ac2d1a4a02421f25da9559850d59457bac82f2b8d7a44fe83a64f770098 \ + --hash=sha256:825656d0743699c529c5943554d223c021ff0494ff1442152ce887ef4f7561a1 \ + --hash=sha256:8637dcd2caa676e475503d1f8fdb327bc495554e10838019651b76d17b98e512 \ + --hash=sha256:96fe52fcdb9345b7cd82ecd34547fca4321f7656d500eca497eb7ea5a926692f \ + --hash=sha256:973faafebaae4c0aaa1a1ca1ce02434554d67e628b8d805e61f874b84e136b09 \ + --hash=sha256:996bb9399059c5b82f76b53ff8bb686069c05acc94656bb259b1d63d04a9506f \ + --hash=sha256:a38c19106902bb19351b83802531fea19dee18e5b37b36454f27f11ff956f7fc \ + --hash=sha256:a6b46587b14b888e95e4a24d7b13ae91fa22386c199ee7b418f449032b2fa3b8 \ + --hash=sha256:a9f7f672a3388133335589cfca93ed468509cb7b93ba3105fce780d04a6576a0 \ + --hash=sha256:aa08e04e08aaf974d4458def539dece0d28146d866a39da5639596f4921fd761 \ + --hash=sha256:b0df3635b9c8ef48bd3be5f862cf71b0a4716fa0e702155c45067c6b711ddcef \ + --hash=sha256:b47fbb433d3260adcd51eb54f92a2ffbc90a4595f8970ee00e064c644ac788f5 \ + --hash=sha256:baed7e8d7481bfe0874b566850cb0b85243e982388b7b23348c6db2ee2b2ae8e \ + --hash=sha256:bc6f24b3d1ecc1eebfbf5d6051faa49af40b03be1aaa781ebdadcbc090b4539b \ + --hash=sha256:c006b607a865b07cd981ccb218a04fc86b600411d83d6fc261357f1c0966755d \ + --hash=sha256:c181ba05ce8299c7aa3125c27b9c2167bca4a4445b7ce73d5febc411ca692e43 \ + --hash=sha256:c7662f0e3673fe4e832fe07b65c50342ea27d989f92c80355658c7f888fcc83c \ + --hash=sha256:c80e4a09b3d95b4e1cac08643f1152fa71a0a821a2d4277334c88d54b2219a41 \ + --hash=sha256:c894b4305373b9c5576d7a12b473702afdf48ce5369c074ba304cc5ad8730dff \ + --hash=sha256:d7aac50327da5d208db2eec22eb11e491e3fe13d22653dce51b0f4109101b408 \ + --hash=sha256:d89dd2b6da69c4fff5e39c28a382199ddedc3a5be5390115608345dec660b9e2 \ + --hash=sha256:d9beb777a78c331580705326d2367488d5bc473b49a9bc3036c154832520aca9 \ + --hash=sha256:dc258a761a16daa791081d026f0ed4399b582712e6fc887a95af09df10c5ca57 \ + --hash=sha256:e14e26956e6f1696070788252dcdff11b4aca4c3e8bd166e0df1bb8f315a67cb \ + --hash=sha256:e6988e90fcf617da2b5c78902fe8e668361b43b4fe26dbf2d7b0f8034d4cafb9 \ + --hash=sha256:e711e02f49e176a01d0349d82cb5f05ba4db7d5e7e0defd026328e5cfb3226d3 \ + --hash=sha256:ea4dedd6e394a9c180b33c2c872b92f7ce0f8e7ad93e9585312b0c5a04777a4a \ + --hash=sha256:ecc76a9ba2911d8d37ac01de72834d8849e55473457558e12995f4cd53e778e0 \ + --hash=sha256:f55ba01150f52b1027829b50d70ef1dafd9821ea82905b63936668403c3b471e \ + --hash=sha256:f653490b33e9c3a4c1c01d41bc2aef08f9475af51146e4a7710c450cf9761598 \ + --hash=sha256:fa2d1337dc61c8dc417fbccf20f6d1e139896a30721b7f1e832b2bb6ef4eb6c4 + # via + # -r build/requirements.in + # chex + # clu + # flax + # jax + # jaxlib + # ml-dtypes + # optax + # orbax-checkpoint + # scipy + # tensorstore +opt-einsum==3.4.0 \ + --hash=sha256:69bb92469f86a1565195ece4ac0323943e83477171b91d24c35afe028a90d7cd \ + --hash=sha256:96ca72f1b886d148241348783498194c577fa30a8faac108586b14f1ba4473ac + # via jax +optax==0.2.3 \ + --hash=sha256:083e603dcd731d7e74d99f71c12f77937dd53f79001b4c09c290e4f47dd2e94f \ + --hash=sha256:ec7ab925440b0c5a512e1f24fba0fb3e7d760a7fd5d2496d7a691e9d37da01d9 + # via + # -r build/requirements.in + # flax +orbax==0.1.9 \ + --hash=sha256:42dd487ceef9fbf027f4720f3d041686af75120466a528a8a8141226bc197218 + # via -r build/requirements.in +orbax-checkpoint==0.8.0 \ + --hash=sha256:0754ecc2e5fc858e62bbcf610606502d8e1c9ada7295d9bb49cc172f884b0b1e \ + --hash=sha256:df8e353feb7f4eeba9f5b16f704699df54c3c44c5c6ec4d4d117c40bf27830cc + # 
via + # flax + # orbax +packaging==24.1 \ + --hash=sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002 \ + --hash=sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124 + # via clu +protobuf==5.28.3 \ + --hash=sha256:0c4eec6f987338617072592b97943fdbe30d019c56126493111cf24344c1cc24 \ + --hash=sha256:135658402f71bbd49500322c0f736145731b16fc79dc8f367ab544a17eab4535 \ + --hash=sha256:27b246b3723692bf1068d5734ddaf2fccc2cdd6e0c9b47fe099244d80200593b \ + --hash=sha256:3e6101d095dfd119513cde7259aa703d16c6bbdfae2554dfe5cfdbe94e32d548 \ + --hash=sha256:3fa2de6b8b29d12c61911505d893afe7320ce7ccba4df913e2971461fa36d584 \ + --hash=sha256:64badbc49180a5e401f373f9ce7ab1d18b63f7dd4a9cdc43c92b9f0b481cef7b \ + --hash=sha256:70585a70fc2dd4818c51287ceef5bdba6387f88a578c86d47bb34669b5552c36 \ + --hash=sha256:712319fbdddb46f21abb66cd33cb9e491a5763b2febd8f228251add221981135 \ + --hash=sha256:91fba8f445723fcf400fdbe9ca796b19d3b1242cd873907979b9ed71e4afe868 \ + --hash=sha256:a3f6857551e53ce35e60b403b8a27b0295f7d6eb63d10484f12bc6879c715687 \ + --hash=sha256:cee1757663fa32a1ee673434fcf3bf24dd54763c79690201208bafec62f19eed + # via + # -r build/requirements.in + # orbax-checkpoint +pygments==2.18.0 \ + --hash=sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199 \ + --hash=sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a + # via rich +pyyaml==6.0.2 \ + --hash=sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff \ + --hash=sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48 \ + --hash=sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086 \ + --hash=sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e \ + --hash=sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133 \ + --hash=sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5 \ + --hash=sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484 \ + --hash=sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee \ + --hash=sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5 \ + --hash=sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68 \ + --hash=sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a \ + --hash=sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf \ + --hash=sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99 \ + --hash=sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8 \ + --hash=sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85 \ + --hash=sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19 \ + --hash=sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc \ + --hash=sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a \ + --hash=sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1 \ + --hash=sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317 \ + --hash=sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c \ + --hash=sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631 \ + --hash=sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d \ + --hash=sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652 \ + --hash=sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5 \ + 
--hash=sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e \ + --hash=sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b \ + --hash=sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8 \ + --hash=sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476 \ + --hash=sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706 \ + --hash=sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563 \ + --hash=sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237 \ + --hash=sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b \ + --hash=sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083 \ + --hash=sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180 \ + --hash=sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425 \ + --hash=sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e \ + --hash=sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f \ + --hash=sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725 \ + --hash=sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183 \ + --hash=sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab \ + --hash=sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774 \ + --hash=sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725 \ + --hash=sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e \ + --hash=sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5 \ + --hash=sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d \ + --hash=sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290 \ + --hash=sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44 \ + --hash=sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed \ + --hash=sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4 \ + --hash=sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba \ + --hash=sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12 \ + --hash=sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4 + # via + # flax + # ml-collections + # orbax-checkpoint +requests==2.32.3 \ + --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ + --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 + # via jax +rich==13.9.4 \ + --hash=sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098 \ + --hash=sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90 + # via flax +scipy==1.14.1 \ + --hash=sha256:0c2f95de3b04e26f5f3ad5bb05e74ba7f68b837133a4492414b3afd79dfe540e \ + --hash=sha256:1729560c906963fc8389f6aac023739ff3983e727b1a4d87696b7bf108316a79 \ + --hash=sha256:278266012eb69f4a720827bdd2dc54b2271c97d84255b2faaa8f161a158c3b37 \ + --hash=sha256:2843f2d527d9eebec9a43e6b406fb7266f3af25a751aa91d62ff416f54170bc5 \ + --hash=sha256:2da0469a4ef0ecd3693761acbdc20f2fdeafb69e6819cc081308cc978153c675 \ + --hash=sha256:2ff0a7e01e422c15739ecd64432743cf7aae2b03f3084288f399affcefe5222d \ + --hash=sha256:2ff38e22128e6c03ff73b6bb0f85f897d2362f8c052e3b8ad00532198fbdae3f \ + --hash=sha256:30ac8812c1d2aab7131a79ba62933a2a76f582d5dbbc695192453dae67ad6310 \ + --hash=sha256:3a1b111fac6baec1c1d92f27e76511c9e7218f1695d61b59e05e0fe04dc59617 \ + 
--hash=sha256:4079b90df244709e675cdc8b93bfd8a395d59af40b72e339c2287c91860deb8e \ + --hash=sha256:5149e3fd2d686e42144a093b206aef01932a0059c2a33ddfa67f5f035bdfe13e \ + --hash=sha256:5a275584e726026a5699459aa72f828a610821006228e841b94275c4a7c08417 \ + --hash=sha256:631f07b3734d34aced009aaf6fedfd0eb3498a97e581c3b1e5f14a04164a456d \ + --hash=sha256:716e389b694c4bb564b4fc0c51bc84d381735e0d39d3f26ec1af2556ec6aad94 \ + --hash=sha256:8426251ad1e4ad903a4514712d2fa8fdd5382c978010d1c6f5f37ef286a713ad \ + --hash=sha256:8475230e55549ab3f207bff11ebfc91c805dc3463ef62eda3ccf593254524ce8 \ + --hash=sha256:8bddf15838ba768bb5f5083c1ea012d64c9a444e16192762bd858f1e126196d0 \ + --hash=sha256:8e32dced201274bf96899e6491d9ba3e9a5f6b336708656466ad0522d8528f69 \ + --hash=sha256:8f9ea80f2e65bdaa0b7627fb00cbeb2daf163caa015e59b7516395fe3bd1e066 \ + --hash=sha256:97c5dddd5932bd2a1a31c927ba5e1463a53b87ca96b5c9bdf5dfd6096e27efc3 \ + --hash=sha256:a49f6ed96f83966f576b33a44257d869756df6cf1ef4934f59dd58b25e0327e5 \ + --hash=sha256:af29a935803cc707ab2ed7791c44288a682f9c8107bc00f0eccc4f92c08d6e07 \ + --hash=sha256:b05d43735bb2f07d689f56f7b474788a13ed8adc484a85aa65c0fd931cf9ccd2 \ + --hash=sha256:b28d2ca4add7ac16ae8bb6632a3c86e4b9e4d52d3e34267f6e1b0c1f8d87e389 \ + --hash=sha256:b99722ea48b7ea25e8e015e8341ae74624f72e5f21fc2abd45f3a93266de4c5d \ + --hash=sha256:baff393942b550823bfce952bb62270ee17504d02a1801d7fd0719534dfb9c84 \ + --hash=sha256:c0ee987efa6737242745f347835da2cc5bb9f1b42996a4d97d5c7ff7928cb6f2 \ + --hash=sha256:d0d2821003174de06b69e58cef2316a6622b60ee613121199cb2852a873f8cf3 \ + --hash=sha256:e0cf28db0f24a38b2a0ca33a85a54852586e43cf6fd876365c86e0657cfe7d73 \ + --hash=sha256:e4f5a7c49323533f9103d4dacf4e4f07078f360743dec7f7596949149efeec06 \ + --hash=sha256:eb58ca0abd96911932f688528977858681a59d61a7ce908ffd355957f7025cfc \ + --hash=sha256:edaf02b82cd7639db00dbff629995ef185c8df4c3ffa71a5562a595765a06ce1 \ + --hash=sha256:fef8c87f8abfb884dac04e97824b61299880c43f4ce675dd2cbeadd3c9b466d2 + # via + # jax + # jaxlib +six==1.16.0 \ + --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ + --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 + # via ml-collections +tensorstore==0.1.67 \ + --hash=sha256:186664b53d438f041b6aa706f0537147e4a23c2a4920f4483c77167967042081 \ + --hash=sha256:1b9950271f740b60286d6f88af740debb7f471036337ac864673415ef7dc46f0 \ + --hash=sha256:32cd94e9974e1683c1984041a1f12f8db0dc94a8cbc266e444451dca0f4228a4 \ + --hash=sha256:3476f2a3338d858dd34fcfdb8120df90203acc606fe41f8fdc70a8f3aee0e5e1 \ + --hash=sha256:3abfe92bf11721b43ed124c5f00c6c4b191b330c3ab0a6eb2cc8a4aa06760864 \ + --hash=sha256:53a9efd39ec0c9a8ccc11d4ffda719d210e95c4a4e3a9ccd6ea9a012e0794596 \ + --hash=sha256:56372833decf2e9fd6e57e0619e2eb167f22b7f9a5d4fa715b17959e4cdf2983 \ + --hash=sha256:686d330c8689306e390ed46aff85337f836e9e8ffcee019c89ce47e58bdae8cc \ + --hash=sha256:74eb34cea61081c6505204fe59e6183c67bf68535dd0f5a35eb6db04a951e9b9 \ + --hash=sha256:82ec1e66bf5f581f0192ff257c162db3ceccab3a0fb42378c06efeb555b46fe8 \ + --hash=sha256:83f7281d5212f080554a23bfebe09ec4d9ce07047a8146dbb4350d5664d955a9 \ + --hash=sha256:937da6006e1303960bcca8542168973735915207f97a93dc40288f1b26a3a7c1 \ + --hash=sha256:972fc74103d672aada6cb5acbd25094482f56c12d3d6a3d11fd49f209c3e451b \ + --hash=sha256:bbbcf520a167cd9466c03c6af8cd92aa8c82fab0b7858a188053a329c1f152b9 \ + --hash=sha256:cfcc4e86f06e22524f29869fdbf432531de71d8f757aa3b749331d2b5e00079c \ + 
--hash=sha256:d3a88a1c3db0fab891e652f1eefa82aa846ae686927cd8ff0c53f6f10d245f99 \ + --hash=sha256:dbc24747e114f11d168fc358cad051e1a2025e6ce8fb3d33b25db51755f8aff5 \ + --hash=sha256:dd6be769293479be523c2ac8a33cf9b5dbc8e5b37436bad740e3d7a782e91232 \ + --hash=sha256:e7421d27cb0ac28acaeb4a5f11a61d3901b48f06a5213b16fef5e11e1ef199fc \ + --hash=sha256:ee9a1000e8e7ebdf495272362fdb66957fba0753cc556a7e98f584cea08a6295 \ + --hash=sha256:fe25948659e8b3b93d12e7c609be6b8d71ba2b2aaba2fea451b7cf95cc340908 + # via + # flax + # orbax-checkpoint +toolz==1.0.0 \ + --hash=sha256:292c8f1c4e7516bf9086f8850935c799a874039c8bcf959d47b600e4c44a6236 \ + --hash=sha256:2c86e3d9a04798ac556793bced838816296a2f085017664e4995cb40a1047a02 + # via chex +typing-extensions==4.12.2 \ + --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d \ + --hash=sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8 + # via + # chex + # clu + # etils + # flax + # orbax-checkpoint +urllib3==2.2.3 \ + --hash=sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac \ + --hash=sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9 + # via requests +wrapt==1.16.0 \ + --hash=sha256:0d2691979e93d06a95a26257adb7bfd0c93818e89b1406f5a28f36e0d8c1e1fc \ + --hash=sha256:14d7dc606219cdd7405133c713f2c218d4252f2a469003f8c46bb92d5d095d81 \ + --hash=sha256:1a5db485fe2de4403f13fafdc231b0dbae5eca4359232d2efc79025527375b09 \ + --hash=sha256:1acd723ee2a8826f3d53910255643e33673e1d11db84ce5880675954183ec47e \ + --hash=sha256:1ca9b6085e4f866bd584fb135a041bfc32cab916e69f714a7d1d397f8c4891ca \ + --hash=sha256:1dd50a2696ff89f57bd8847647a1c363b687d3d796dc30d4dd4a9d1689a706f0 \ + --hash=sha256:2076fad65c6736184e77d7d4729b63a6d1ae0b70da4868adeec40989858eb3fb \ + --hash=sha256:2a88e6010048489cda82b1326889ec075a8c856c2e6a256072b28eaee3ccf487 \ + --hash=sha256:3ebf019be5c09d400cf7b024aa52b1f3aeebeff51550d007e92c3c1c4afc2a40 \ + --hash=sha256:418abb18146475c310d7a6dc71143d6f7adec5b004ac9ce08dc7a34e2babdc5c \ + --hash=sha256:43aa59eadec7890d9958748db829df269f0368521ba6dc68cc172d5d03ed8060 \ + --hash=sha256:44a2754372e32ab315734c6c73b24351d06e77ffff6ae27d2ecf14cf3d229202 \ + --hash=sha256:490b0ee15c1a55be9c1bd8609b8cecd60e325f0575fc98f50058eae366e01f41 \ + --hash=sha256:49aac49dc4782cb04f58986e81ea0b4768e4ff197b57324dcbd7699c5dfb40b9 \ + --hash=sha256:5eb404d89131ec9b4f748fa5cfb5346802e5ee8836f57d516576e61f304f3b7b \ + --hash=sha256:5f15814a33e42b04e3de432e573aa557f9f0f56458745c2074952f564c50e664 \ + --hash=sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d \ + --hash=sha256:66027d667efe95cc4fa945af59f92c5a02c6f5bb6012bff9e60542c74c75c362 \ + --hash=sha256:66dfbaa7cfa3eb707bbfcd46dab2bc6207b005cbc9caa2199bcbc81d95071a00 \ + --hash=sha256:685f568fa5e627e93f3b52fda002c7ed2fa1800b50ce51f6ed1d572d8ab3e7fc \ + --hash=sha256:6906c4100a8fcbf2fa735f6059214bb13b97f75b1a61777fcf6432121ef12ef1 \ + --hash=sha256:6a42cd0cfa8ffc1915aef79cb4284f6383d8a3e9dcca70c445dcfdd639d51267 \ + --hash=sha256:6dcfcffe73710be01d90cae08c3e548d90932d37b39ef83969ae135d36ef3956 \ + --hash=sha256:6f6eac2360f2d543cc875a0e5efd413b6cbd483cb3ad7ebf888884a6e0d2e966 \ + --hash=sha256:72554a23c78a8e7aa02abbd699d129eead8b147a23c56e08d08dfc29cfdddca1 \ + --hash=sha256:73870c364c11f03ed072dda68ff7aea6d2a3a5c3fe250d917a429c7432e15228 \ + --hash=sha256:73aa7d98215d39b8455f103de64391cb79dfcad601701a3aa0dddacf74911d72 \ + --hash=sha256:75ea7d0ee2a15733684badb16de6794894ed9c55aa5e9903260922f0482e687d \ + 
--hash=sha256:7bd2d7ff69a2cac767fbf7a2b206add2e9a210e57947dd7ce03e25d03d2de292 \ + --hash=sha256:807cc8543a477ab7422f1120a217054f958a66ef7314f76dd9e77d3f02cdccd0 \ + --hash=sha256:8e9723528b9f787dc59168369e42ae1c3b0d3fadb2f1a71de14531d321ee05b0 \ + --hash=sha256:9090c9e676d5236a6948330e83cb89969f433b1943a558968f659ead07cb3b36 \ + --hash=sha256:9153ed35fc5e4fa3b2fe97bddaa7cbec0ed22412b85bcdaf54aeba92ea37428c \ + --hash=sha256:9159485323798c8dc530a224bd3ffcf76659319ccc7bbd52e01e73bd0241a0c5 \ + --hash=sha256:941988b89b4fd6b41c3f0bfb20e92bd23746579736b7343283297c4c8cbae68f \ + --hash=sha256:94265b00870aa407bd0cbcfd536f17ecde43b94fb8d228560a1e9d3041462d73 \ + --hash=sha256:98b5e1f498a8ca1858a1cdbffb023bfd954da4e3fa2c0cb5853d40014557248b \ + --hash=sha256:9b201ae332c3637a42f02d1045e1d0cccfdc41f1f2f801dafbaa7e9b4797bfc2 \ + --hash=sha256:a0ea261ce52b5952bf669684a251a66df239ec6d441ccb59ec7afa882265d593 \ + --hash=sha256:a33a747400b94b6d6b8a165e4480264a64a78c8a4c734b62136062e9a248dd39 \ + --hash=sha256:a452f9ca3e3267cd4d0fcf2edd0d035b1934ac2bd7e0e57ac91ad6b95c0c6389 \ + --hash=sha256:a86373cf37cd7764f2201b76496aba58a52e76dedfaa698ef9e9688bfd9e41cf \ + --hash=sha256:ac83a914ebaf589b69f7d0a1277602ff494e21f4c2f743313414378f8f50a4cf \ + --hash=sha256:aefbc4cb0a54f91af643660a0a150ce2c090d3652cf4052a5397fb2de549cd89 \ + --hash=sha256:b3646eefa23daeba62643a58aac816945cadc0afaf21800a1421eeba5f6cfb9c \ + --hash=sha256:b47cfad9e9bbbed2339081f4e346c93ecd7ab504299403320bf85f7f85c7d46c \ + --hash=sha256:b935ae30c6e7400022b50f8d359c03ed233d45b725cfdd299462f41ee5ffba6f \ + --hash=sha256:bb2dee3874a500de01c93d5c71415fcaef1d858370d405824783e7a8ef5db440 \ + --hash=sha256:bc57efac2da352a51cc4658878a68d2b1b67dbe9d33c36cb826ca449d80a8465 \ + --hash=sha256:bf5703fdeb350e36885f2875d853ce13172ae281c56e509f4e6eca049bdfb136 \ + --hash=sha256:c31f72b1b6624c9d863fc095da460802f43a7c6868c5dda140f51da24fd47d7b \ + --hash=sha256:c5cd603b575ebceca7da5a3a251e69561bec509e0b46e4993e1cac402b7247b8 \ + --hash=sha256:d2efee35b4b0a347e0d99d28e884dfd82797852d62fcd7ebdeee26f3ceb72cf3 \ + --hash=sha256:d462f28826f4657968ae51d2181a074dfe03c200d6131690b7d65d55b0f360f8 \ + --hash=sha256:d5e49454f19ef621089e204f862388d29e6e8d8b162efce05208913dde5b9ad6 \ + --hash=sha256:da4813f751142436b075ed7aa012a8778aa43a99f7b36afe9b742d3ed8bdc95e \ + --hash=sha256:db2e408d983b0e61e238cf579c09ef7020560441906ca990fe8412153e3b291f \ + --hash=sha256:db98ad84a55eb09b3c32a96c576476777e87c520a34e2519d3e59c44710c002c \ + --hash=sha256:dbed418ba5c3dce92619656802cc5355cb679e58d0d89b50f116e4a9d5a9603e \ + --hash=sha256:dcdba5c86e368442528f7060039eda390cc4091bfd1dca41e8046af7c910dda8 \ + --hash=sha256:decbfa2f618fa8ed81c95ee18a387ff973143c656ef800c9f24fb7e9c16054e2 \ + --hash=sha256:e4fdb9275308292e880dcbeb12546df7f3e0f96c6b41197e0cf37d2826359020 \ + --hash=sha256:eb1b046be06b0fce7249f1d025cd359b4b80fc1c3e24ad9eca33e0dcdb2e4a35 \ + --hash=sha256:eb6e651000a19c96f452c85132811d25e9264d836951022d6e81df2fff38337d \ + --hash=sha256:ed867c42c268f876097248e05b6117a65bcd1e63b779e916fe2e33cd6fd0d3c3 \ + --hash=sha256:edfad1d29c73f9b863ebe7082ae9321374ccb10879eeabc84ba3b69f2579d537 \ + --hash=sha256:f2058f813d4f2b5e3a9eb2eb3faf8f1d99b81c3e51aeda4b168406443e8ba809 \ + --hash=sha256:f6b2d0c6703c988d334f297aa5df18c45e97b0af3679bb75059e0e0bd8b1069d \ + --hash=sha256:f8212564d49c50eb4565e502814f694e240c55551a5f1bc841d4fcaabb0a9b8a \ + --hash=sha256:ffa565331890b90056c01db69c0fe634a776f8019c143a5ae265f9c6bc4bd6d4 + # via clu +zipp==3.20.2 \ + 
--hash=sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350 \ + --hash=sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29 + # via etils + +# The following packages are considered to be unsafe in a requirements file: +setuptools==75.3.0 \ + --hash=sha256:f2504966861356aa38616760c0f66568e535562374995367b4e69c7143cf6bcd \ + --hash=sha256:fba5dd4d766e97be1b1681d98712680ae8f2f26d7881245f2ce9e40714f1a686 + # via chex diff --git a/build_deps/requirements_lock_3_13.txt b/build_deps/requirements_lock_3_13.txt new file mode 100644 index 00000000..3683d95d --- /dev/null +++ b/build_deps/requirements_lock_3_13.txt @@ -0,0 +1,739 @@ +# +# This file is autogenerated by pip-compile with Python 3.13 +# by the following command: +# +# bazel run //build:requirements.update +# +--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html +--find-links https://storage.googleapis.com/jax-releases/libtpu_releases.html + +absl-py==2.1.0 \ + --hash=sha256:526a04eadab8b4ee719ce68f204172ead1027549089702d99b9059f129ff1308 \ + --hash=sha256:7820790efbb316739cde8b4e19357243fc3608a152024288513dd968d7d959ff + # via + # -r build/requirements.in + # chex + # clu + # google-benchmark + # ml-collections + # optax + # orbax-checkpoint +certifi==2024.8.30 \ + --hash=sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8 \ + --hash=sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9 + # via requests +charset-normalizer==3.4.0 \ + --hash=sha256:0099d79bdfcf5c1f0c2c72f91516702ebf8b0b8ddd8905f97a8aecf49712c621 \ + --hash=sha256:0713f3adb9d03d49d365b70b84775d0a0d18e4ab08d12bc46baa6132ba78aaf6 \ + --hash=sha256:07afec21bbbbf8a5cc3651aa96b980afe2526e7f048fdfb7f1014d84acc8b6d8 \ + --hash=sha256:0b309d1747110feb25d7ed6b01afdec269c647d382c857ef4663bbe6ad95a912 \ + --hash=sha256:0d99dd8ff461990f12d6e42c7347fd9ab2532fb70e9621ba520f9e8637161d7c \ + --hash=sha256:0de7b687289d3c1b3e8660d0741874abe7888100efe14bd0f9fd7141bcbda92b \ + --hash=sha256:1110e22af8ca26b90bd6364fe4c763329b0ebf1ee213ba32b68c73de5752323d \ + --hash=sha256:130272c698667a982a5d0e626851ceff662565379baf0ff2cc58067b81d4f11d \ + --hash=sha256:136815f06a3ae311fae551c3df1f998a1ebd01ddd424aa5603a4336997629e95 \ + --hash=sha256:14215b71a762336254351b00ec720a8e85cada43b987da5a042e4ce3e82bd68e \ + --hash=sha256:1db4e7fefefd0f548d73e2e2e041f9df5c59e178b4c72fbac4cc6f535cfb1565 \ + --hash=sha256:1ffd9493de4c922f2a38c2bf62b831dcec90ac673ed1ca182fe11b4d8e9f2a64 \ + --hash=sha256:2006769bd1640bdf4d5641c69a3d63b71b81445473cac5ded39740a226fa88ab \ + --hash=sha256:20587d20f557fe189b7947d8e7ec5afa110ccf72a3128d61a2a387c3313f46be \ + --hash=sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e \ + --hash=sha256:27623ba66c183eca01bf9ff833875b459cad267aeeb044477fedac35e19ba907 \ + --hash=sha256:285e96d9d53422efc0d7a17c60e59f37fbf3dfa942073f666db4ac71e8d726d0 \ + --hash=sha256:2de62e8801ddfff069cd5c504ce3bc9672b23266597d4e4f50eda28846c322f2 \ + --hash=sha256:2f6c34da58ea9c1a9515621f4d9ac379871a8f21168ba1b5e09d74250de5ad62 \ + --hash=sha256:309a7de0a0ff3040acaebb35ec45d18db4b28232f21998851cfa709eeff49d62 \ + --hash=sha256:35c404d74c2926d0287fbd63ed5d27eb911eb9e4a3bb2c6d294f3cfd4a9e0c23 \ + --hash=sha256:3710a9751938947e6327ea9f3ea6332a09bf0ba0c09cae9cb1f250bd1f1549bc \ + --hash=sha256:3d59d125ffbd6d552765510e3f31ed75ebac2c7470c7274195b9161a32350284 \ + --hash=sha256:40d3ff7fc90b98c637bda91c89d51264a3dcf210cade3a2c6f838c7268d7a4ca \ + 
--hash=sha256:425c5f215d0eecee9a56cdb703203dda90423247421bf0d67125add85d0c4455 \ + --hash=sha256:43193c5cda5d612f247172016c4bb71251c784d7a4d9314677186a838ad34858 \ + --hash=sha256:44aeb140295a2f0659e113b31cfe92c9061622cadbc9e2a2f7b8ef6b1e29ef4b \ + --hash=sha256:47334db71978b23ebcf3c0f9f5ee98b8d65992b65c9c4f2d34c2eaf5bcaf0594 \ + --hash=sha256:4796efc4faf6b53a18e3d46343535caed491776a22af773f366534056c4e1fbc \ + --hash=sha256:4a51b48f42d9358460b78725283f04bddaf44a9358197b889657deba38f329db \ + --hash=sha256:4b67fdab07fdd3c10bb21edab3cbfe8cf5696f453afce75d815d9d7223fbe88b \ + --hash=sha256:4ec9dd88a5b71abfc74e9df5ebe7921c35cbb3b641181a531ca65cdb5e8e4dea \ + --hash=sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6 \ + --hash=sha256:54b6a92d009cbe2fb11054ba694bc9e284dad30a26757b1e372a1fdddaf21920 \ + --hash=sha256:55f56e2ebd4e3bc50442fbc0888c9d8c94e4e06a933804e2af3e89e2f9c1c749 \ + --hash=sha256:5726cf76c982532c1863fb64d8c6dd0e4c90b6ece9feb06c9f202417a31f7dd7 \ + --hash=sha256:5d447056e2ca60382d460a604b6302d8db69476fd2015c81e7c35417cfabe4cd \ + --hash=sha256:5ed2e36c3e9b4f21dd9422f6893dec0abf2cca553af509b10cd630f878d3eb99 \ + --hash=sha256:5ff2ed8194587faf56555927b3aa10e6fb69d931e33953943bc4f837dfee2242 \ + --hash=sha256:62f60aebecfc7f4b82e3f639a7d1433a20ec32824db2199a11ad4f5e146ef5ee \ + --hash=sha256:63bc5c4ae26e4bc6be6469943b8253c0fd4e4186c43ad46e713ea61a0ba49129 \ + --hash=sha256:6b40e8d38afe634559e398cc32b1472f376a4099c75fe6299ae607e404c033b2 \ + --hash=sha256:6b493a043635eb376e50eedf7818f2f322eabbaa974e948bd8bdd29eb7ef2a51 \ + --hash=sha256:6dba5d19c4dfab08e58d5b36304b3f92f3bd5d42c1a3fa37b5ba5cdf6dfcbcee \ + --hash=sha256:6fd30dc99682dc2c603c2b315bded2799019cea829f8bf57dc6b61efde6611c8 \ + --hash=sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b \ + --hash=sha256:7706f5850360ac01d80c89bcef1640683cc12ed87f42579dab6c5d3ed6888613 \ + --hash=sha256:7782afc9b6b42200f7362858f9e73b1f8316afb276d316336c0ec3bd73312742 \ + --hash=sha256:79983512b108e4a164b9c8d34de3992f76d48cadc9554c9e60b43f308988aabe \ + --hash=sha256:7f683ddc7eedd742e2889d2bfb96d69573fde1d92fcb811979cdb7165bb9c7d3 \ + --hash=sha256:82357d85de703176b5587dbe6ade8ff67f9f69a41c0733cf2425378b49954de5 \ + --hash=sha256:84450ba661fb96e9fd67629b93d2941c871ca86fc38d835d19d4225ff946a631 \ + --hash=sha256:86f4e8cca779080f66ff4f191a685ced73d2f72d50216f7112185dc02b90b9b7 \ + --hash=sha256:8cda06946eac330cbe6598f77bb54e690b4ca93f593dee1568ad22b04f347c15 \ + --hash=sha256:8ce7fd6767a1cc5a92a639b391891bf1c268b03ec7e021c7d6d902285259685c \ + --hash=sha256:8ff4e7cdfdb1ab5698e675ca622e72d58a6fa2a8aa58195de0c0061288e6e3ea \ + --hash=sha256:9289fd5dddcf57bab41d044f1756550f9e7cf0c8e373b8cdf0ce8773dc4bd417 \ + --hash=sha256:92a7e36b000bf022ef3dbb9c46bfe2d52c047d5e3f3343f43204263c5addc250 \ + --hash=sha256:92db3c28b5b2a273346bebb24857fda45601aef6ae1c011c0a997106581e8a88 \ + --hash=sha256:95c3c157765b031331dd4db3c775e58deaee050a3042fcad72cbc4189d7c8dca \ + --hash=sha256:980b4f289d1d90ca5efcf07958d3eb38ed9c0b7676bf2831a54d4f66f9c27dfa \ + --hash=sha256:9ae4ef0b3f6b41bad6366fb0ea4fc1d7ed051528e113a60fa2a65a9abb5b1d99 \ + --hash=sha256:9c98230f5042f4945f957d006edccc2af1e03ed5e37ce7c373f00a5a4daa6149 \ + --hash=sha256:9fa2566ca27d67c86569e8c85297aaf413ffab85a8960500f12ea34ff98e4c41 \ + --hash=sha256:a14969b8691f7998e74663b77b4c36c0337cb1df552da83d5c9004a93afdb574 \ + --hash=sha256:a8aacce6e2e1edcb6ac625fb0f8c3a9570ccc7bfba1f63419b3769ccf6a00ed0 \ + 
--hash=sha256:a8e538f46104c815be19c975572d74afb53f29650ea2025bbfaef359d2de2f7f \ + --hash=sha256:aa41e526a5d4a9dfcfbab0716c7e8a1b215abd3f3df5a45cf18a12721d31cb5d \ + --hash=sha256:aa693779a8b50cd97570e5a0f343538a8dbd3e496fa5dcb87e29406ad0299654 \ + --hash=sha256:ab22fbd9765e6954bc0bcff24c25ff71dcbfdb185fcdaca49e81bac68fe724d3 \ + --hash=sha256:ab2e5bef076f5a235c3774b4f4028a680432cded7cad37bba0fd90d64b187d19 \ + --hash=sha256:ab973df98fc99ab39080bfb0eb3a925181454d7c3ac8a1e695fddfae696d9e90 \ + --hash=sha256:af73657b7a68211996527dbfeffbb0864e043d270580c5aef06dc4b659a4b578 \ + --hash=sha256:b197e7094f232959f8f20541ead1d9862ac5ebea1d58e9849c1bf979255dfac9 \ + --hash=sha256:b295729485b06c1a0683af02a9e42d2caa9db04a373dc38a6a58cdd1e8abddf1 \ + --hash=sha256:b8831399554b92b72af5932cdbbd4ddc55c55f631bb13ff8fe4e6536a06c5c51 \ + --hash=sha256:b8dcd239c743aa2f9c22ce674a145e0a25cb1566c495928440a181ca1ccf6719 \ + --hash=sha256:bcb4f8ea87d03bc51ad04add8ceaf9b0f085ac045ab4d74e73bbc2dc033f0236 \ + --hash=sha256:bd7af3717683bea4c87acd8c0d3d5b44d56120b26fd3f8a692bdd2d5260c620a \ + --hash=sha256:bf4475b82be41b07cc5e5ff94810e6a01f276e37c2d55571e3fe175e467a1a1c \ + --hash=sha256:c3e446d253bd88f6377260d07c895816ebf33ffffd56c1c792b13bff9c3e1ade \ + --hash=sha256:c57516e58fd17d03ebe67e181a4e4e2ccab1168f8c2976c6a334d4f819fe5944 \ + --hash=sha256:c94057af19bc953643a33581844649a7fdab902624d2eb739738a30e2b3e60fc \ + --hash=sha256:cab5d0b79d987c67f3b9e9c53f54a61360422a5a0bc075f43cab5621d530c3b6 \ + --hash=sha256:ce031db0408e487fd2775d745ce30a7cd2923667cf3b69d48d219f1d8f5ddeb6 \ + --hash=sha256:cee4373f4d3ad28f1ab6290684d8e2ebdb9e7a1b74fdc39e4c211995f77bec27 \ + --hash=sha256:d5b054862739d276e09928de37c79ddeec42a6e1bfc55863be96a36ba22926f6 \ + --hash=sha256:dbe03226baf438ac4fda9e2d0715022fd579cb641c4cf639fa40d53b2fe6f3e2 \ + --hash=sha256:dc15e99b2d8a656f8e666854404f1ba54765871104e50c8e9813af8a7db07f12 \ + --hash=sha256:dcaf7c1524c0542ee2fc82cc8ec337f7a9f7edee2532421ab200d2b920fc97cf \ + --hash=sha256:dd4eda173a9fcccb5f2e2bd2a9f423d180194b1bf17cf59e3269899235b2a114 \ + --hash=sha256:dd9a8bd8900e65504a305bf8ae6fa9fbc66de94178c420791d0293702fce2df7 \ + --hash=sha256:de7376c29d95d6719048c194a9cf1a1b0393fbe8488a22008610b0361d834ecf \ + --hash=sha256:e7fdd52961feb4c96507aa649550ec2a0d527c086d284749b2f582f2d40a2e0d \ + --hash=sha256:e91f541a85298cf35433bf66f3fab2a4a2cff05c127eeca4af174f6d497f0d4b \ + --hash=sha256:e9e3c4c9e1ed40ea53acf11e2a386383c3304212c965773704e4603d589343ed \ + --hash=sha256:ee803480535c44e7f5ad00788526da7d85525cfefaf8acf8ab9a310000be4b03 \ + --hash=sha256:f09cb5a7bbe1ecae6e87901a2eb23e0256bb524a79ccc53eb0b7629fbe7677c4 \ + --hash=sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67 \ + --hash=sha256:f1a2f519ae173b5b6a2c9d5fa3116ce16e48b3462c8b96dfdded11055e3d6365 \ + --hash=sha256:f28f891ccd15c514a0981f3b9db9aa23d62fe1a99997512b0491d2ed323d229a \ + --hash=sha256:f3e73a4255342d4eb26ef6df01e3962e73aa29baa3124a8e824c5d3364a65748 \ + --hash=sha256:f606a1881d2663630ea5b8ce2efe2111740df4b687bd78b34a8131baa007f79b \ + --hash=sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079 \ + --hash=sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482 + # via requests +chex==0.1.87 \ + --hash=sha256:0096d89cc8d898bb521ef4bfbf5c24549022b0e5b301f529ab57238896fe6c5d \ + --hash=sha256:ce536475661fd96d21be0c1728ecdbedd03f8ff950c662dfc338c92ea782cb16 + # via optax +clu==0.0.12 \ + --hash=sha256:0d183e7d25f7dd0700444510a264e24700e2f068bdabd199ed22866f7e54edba \ + 
--hash=sha256:f71eaa1afbd30f57f7709257ba7e1feb8ad5c1c3dcae3606672a138678bb3ce4 + # via -r build/requirements.in +contextlib2==21.6.0 \ + --hash=sha256:3fbdb64466afd23abaf6c977627b75b6139a5a3e8ce38405c5b413aed7a0471f \ + --hash=sha256:ab1e2bfe1d01d968e1b7e8d9023bc51ef3509bba217bb730cee3827e1ee82869 + # via ml-collections +dm-tree==0.1.8 \ + --hash=sha256:054b461f8176f4bce7a21f7b1870f873a1ced3bdbe1282c816c550bb43c71fa6 \ + --hash=sha256:09964470f76a5201aff2e8f9b26842976de7889300676f927930f6285e256760 \ + --hash=sha256:0d3172394079a86c3a759179c65f64c48d1a42b89495fcf38976d11cc3bb952c \ + --hash=sha256:0e9620ccf06393eb6b613b5e366469304622d4ea96ae6540b28a33840e6c89cf \ + --hash=sha256:0fcaabbb14e7980377439e7140bd05552739ca5e515ecb3119f234acee4b9430 \ + --hash=sha256:1607ce49aa42f010d1e5e616d92ce899d66835d4d8bea49679582435285515de \ + --hash=sha256:181c35521d480d0365f39300542cb6cd7fd2b77351bb43d7acfda15aef63b317 \ + --hash=sha256:1d7c26e431fc93cc7e0cba867eb000db6a05f6f2b25af11ac4e9dada88fc5bca \ + --hash=sha256:1fe962015b2fe1282892b28ebe962faed53c7f98d942da9a4625cbf27baef913 \ + --hash=sha256:250b692fb75f45f02e2f58fbef9ab338904ef334b90557565621fa251df267cf \ + --hash=sha256:2869228d9c619074de501a3c10dc7f07c75422f8fab36ecdcb859b6f1b1ec3ef \ + --hash=sha256:28c52cbf4f8b3dbd0beaedf44f69fa85eec5e9dede612e08035e06ada6ec9426 \ + --hash=sha256:2f7915660f59c09068e428613c480150180df1060561fd0d1470684ae7007bd1 \ + --hash=sha256:343a4a4ebaa127451ff971254a4be4084eb4bdc0b2513c32b46f6f728fd03f9e \ + --hash=sha256:35cc164a79336bfcfafb47e5f297898359123bbd3330c1967f0c4994f9cf9f60 \ + --hash=sha256:378cc8ad93c5fe3590f405a309980721f021c790ca1bdf9b15bb1d59daec57f5 \ + --hash=sha256:39070ba268c0491af9fe7a58644d99e8b4f2cde6e5884ba3380bddc84ed43d5f \ + --hash=sha256:435227cf3c5dc63f4de054cf3d00183790bd9ead4c3623138c74dde7f67f521b \ + --hash=sha256:5483dca4d7eb1a0d65fe86d3b6a53ae717face83c1f17e0887b1a4a64ae5c410 \ + --hash=sha256:694c3654cfd2a81552c08ec66bb5c4a3d48fa292b9a181880fb081c36c5b9134 \ + --hash=sha256:75c5d528bb992981c20793b6b453e91560784215dffb8a5440ba999753c14ceb \ + --hash=sha256:803bfc53b4659f447ac694dbd04235f94a73ef7c1fd1e0df7c84ac41e0bc963b \ + --hash=sha256:81fce77f22a302d7a5968aebdf4efafef4def7ce96528719a354e6990dcd49c7 \ + --hash=sha256:83b7764de0d855338abefc6e3ee9fe40d301668310aa3baea3f778ff051f4393 \ + --hash=sha256:8c60a7eadab64c2278861f56bca320b2720f163dca9d7558103c3b77f2416571 \ + --hash=sha256:8ed3564abed97c806db122c2d3e1a2b64c74a63debe9903aad795167cc301368 \ + --hash=sha256:94d3f0826311f45ee19b75f5b48c99466e4218a0489e81c0f0167bda50cacf22 \ + --hash=sha256:96a548a406a6fb15fe58f6a30a57ff2f2aafbf25f05afab00c8f5e5977b6c715 \ + --hash=sha256:a5d819c38c03f0bb5b3b3703c60e4b170355a0fc6b5819325bf3d4ceb3ae7e80 \ + --hash=sha256:ad16ceba90a56ec47cf45b21856d14962ac314787975ef786efb5e6e9ca75ec7 \ + --hash=sha256:af4b3d372f2477dcd89a6e717e4a575ca35ccc20cc4454a8a4b6f8838a00672d \ + --hash=sha256:b095ba4f8ca1ba19350fd53cf1f8f3eb0bd406aa28af64a6dfc86707b32a810a \ + --hash=sha256:b9bd9b9ccb59409d33d51d84b7668010c04c2af7d4a371632874c1ca356cff3d \ + --hash=sha256:b9f89a454e98806b44fe9d40ec9eee61f848388f7e79ac2371a55679bd5a3ac6 \ + --hash=sha256:bb2d109f42190225112da899b9f3d46d0d5f26aef501c61e43529fe9322530b5 \ + --hash=sha256:c0a94aba18a35457a1b5cd716fd7b46c5dafdc4cf7869b4bae665b91c4682a8e \ + --hash=sha256:c5c8c12e3fda754ef6af94161bacdaeda816d941995fac415d6855c6c386af68 \ + --hash=sha256:d1612fcaecd79023dbc6a6ae48d51a80beb5c385d6f3f6d71688e57bc8d07de8 \ + 
--hash=sha256:d16e1f2a073604cfcc09f7131ae8d534674f43c3aef4c25742eae295bc60d04f \ + --hash=sha256:d20f2faa3672b52e5013f4077117bfb99c4cfc0b445d3bde1584c34032b57436 \ + --hash=sha256:d40fa4106ca6edc66760246a08f500ec0c85ef55c762fb4a363f6ee739ba02ee \ + --hash=sha256:de287fabc464b8734be251e46e06aa9aa1001f34198da2b6ce07bd197172b9cb \ + --hash=sha256:e4d714371bb08839e4e5e29024fc95832d9affe129825ef38836b143028bd144 \ + --hash=sha256:ea9e59e0451e7d29aece402d9f908f2e2a80922bcde2ebfd5dcb07750fcbfee8 \ + --hash=sha256:f7ac31b9aecccb2c6e1ab29706f6ded3eba0c2c69c770322c9c685929c3d6afb \ + --hash=sha256:fa42a605d099ee7d41ba2b5fb75e21423951fd26e5d50583a00471238fb3021d + # via -r build/requirements.in +einops==0.8.0 \ + --hash=sha256:63486517fed345712a8385c100cb279108d9d47e6ae59099b07657e983deae85 \ + --hash=sha256:9572fb63046264a862693b0a87088af3bdc8c068fde03de63453cbbde245465f + # via -r build/requirements.in +etils[epath,epy]==1.10.0 \ + --hash=sha256:0777fe60a234b4c65ca53470fc64f2dd2d0c6bca7fcc623fdaa8d7fa5a317098 \ + --hash=sha256:4eaa9d7248fd4eeb75e44d47ca29875a5ccea044cc14a17435794bf8ac116a05 + # via + # clu + # optax + # orbax-checkpoint +flax==0.10.1 \ + --hash=sha256:5218959706bc659a1f282ca537446163093d186d8edb9b1405c0efee4d90d22a \ + --hash=sha256:ea98ed843c37954af2e262ea47356312a046794d7a5490d31682dffe908e25d3 + # via + # -r build/requirements.in + # clu +fsspec==2024.10.0 \ + --hash=sha256:03b9a6785766a4de40368b88906366755e2819e758b83705c88cd7cb5fe81871 \ + --hash=sha256:eda2d8a4116d4f2429db8550f2457da57279247dd930bb12f821b58391359493 + # via etils +google-benchmark==1.8.3 \ + --hash=sha256:063f6df1ed384e4dc881ac96644153c18ed755f1a2ed32272534a110bdf14871 \ + --hash=sha256:066b69f809fd0ebc697c90075d1194e4c4ada117811731431523f821b421b28f \ + --hash=sha256:2b3bb7905233dec505de5cff35e0725b190f411d16ae97e9050073bf9c79cf2a \ + --hash=sha256:5c4786323817112303edf7fd70dc60d1aa15c175d1c9e2c63d71292bb3e51828 \ + --hash=sha256:71152a826b162146473a06015eefa9f066e19b316a06826fbf25386615653a64 \ + --hash=sha256:731f1881b757df18add80566ae796b6da101935ea1f45932d1ee094d5fb85b46 \ + --hash=sha256:902d6e6da560a716ba709c6b55f8585f1aa64a76711b9a1f068e064567f58a4a \ + --hash=sha256:93e9ef9abf9f9e845a2141935bbcee5e42a7bedc3efb14072adc0310a8b49072 \ + --hash=sha256:9e1d39431e2a5d0960676c3f62180f48c0cb2802c42895eaf5541b7029c20301 \ + --hash=sha256:9f3432a57177f7a46608a07551d50edfe608da344aca07d476a888fb36438650 \ + --hash=sha256:aa3354bb71dc3a32672d1c7fd0621f4967c519213c018dd8e20a9d9e6fb2ae7b \ + --hash=sha256:ab8212aaadc39b5aaa0afc063b64959ca93271cf6a72852f0d0aad26f9ae9f24 \ + --hash=sha256:ba0547b1075a290e3432025bb544b02f7c717c30e31f696f82907571cb5e2be9 \ + --hash=sha256:c476005b9e7f32c45000719b7c8c2fa95ddcfc058af8d08052eb73692d143619 \ + --hash=sha256:d1504fd53e936d70f438e474c2e87fd94f81bd74a5ae855b1e40d1f9994cdbeb \ + --hash=sha256:d2ea4544d3e17a6f87432bc97e79fea23490d9c7c4d10ebd213acf6a40bd1b61 \ + --hash=sha256:d5d31bbbec9ebe9a1bab34a631a35988c424ef55ea14055238bc77f7d8f19836 \ + --hash=sha256:e69bd848173557ed3762830725bff00c2a92de974189a54bd77485bb8bcb18f4 \ + --hash=sha256:f23a591951c59100e30d97b7ba222072f544d318f470420e21872dee40a4aff0 \ + --hash=sha256:fb014cb611e929d2c2696b009f51ac657c24f706881f3123f10c810b11ba378b \ + --hash=sha256:fc4faa364f22ef81b7d3e9f4ecc6ad62f28d68c47008002aa64474b941b1c76c + # via -r build/requirements.in +humanize==4.11.0 \ + --hash=sha256:b53caaec8532bcb2fff70c8826f904c35943f8cecaca29d272d9df38092736c0 \ + --hash=sha256:e66f36020a2d5a974c504bd2555cf770621dbdbb6d82f94a6857c0b1ea2608be 
+ # via orbax-checkpoint +idna==3.10 \ + --hash=sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9 \ + --hash=sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3 + # via requests +importlib-resources==6.4.5 \ + --hash=sha256:980862a1d16c9e147a59603677fa2aa5fd82b87f223b6cb870695bcfce830065 \ + --hash=sha256:ac29d5f956f01d5e4bb63102a5a19957f1b9175e45649977264a1416783bb717 + # via etils +jax[tpu]==0.4.35 \ + --hash=sha256:c0c986993026b10bf6f607fecb7417377460254640766ce40f1fef3fd139c12e \ + --hash=sha256:fa99e909a31424abfec750019a6dd36f6acc18a6e7d40e2c0086b932cc351325 + # via + # -r build/requirements.in + # chex + # clu + # flax + # optax + # orbax-checkpoint +jaxlib==0.4.35 \ + --hash=sha256:04d1db3bf0050d120238bfb9b686b58fefcc4d9dd9e2d96aecd3f68a1f1f5e0a \ + --hash=sha256:0be3cf9df879d9ae1b5b92fc281f77d21f522fcbae1a48a02661026bbd9b9309 \ + --hash=sha256:0fd990354d5623d3a34493fcd7213493390dbf5039bea19b62e2aaee1049eda9 \ + --hash=sha256:14aeac3fea2ca1d5afb1878f72470b159cc89adb2633c5f0686f5d7c39f2ac18 \ + --hash=sha256:187cb6929dc139b75d952d67c33118473c1b4105525a3e5607f064e7b8efdc74 \ + --hash=sha256:261570c94b169dc90f3af903282eeec856b52736c0944d243504ced93d19b217 \ + --hash=sha256:330c090bb9af413f552d8a92d097e50baec6b75823430fb2966a49f5298d4c43 \ + --hash=sha256:504d0a2e2117724359d99d7e3663022686dcdddd85aa14bdad02008d444481ad \ + --hash=sha256:5d2d8a5b89d334b875ede98d7fcee946bebef1a1b5abd118ff543bcef4ab09f5 \ + --hash=sha256:7b11ad7c13f7f96f36efd303711ecac425f19ca2ddf65cf1be1541167a959ee5 \ + --hash=sha256:7f8bfc90f68857b223b7e38a9bdf466a4f1cb405c9a4aa11698dc9ab7b35c29b \ + --hash=sha256:8f8c499644660aefd0ae2ee31039da6d4df0f26d0ee67ba9fb316183a5304288 \ + --hash=sha256:907e548ad6ce53b242a55c5f36c2a2a4c37d38f6cd8c356fc550a2f18ab0e82f \ + --hash=sha256:91a283a72263feebe0d110d1136df96950744e47530f12df42c03f36888c971e \ + --hash=sha256:b44f3e6e9fb748bb43df914356cf9d0d0c9a6e446a12c21fe843db25ed0df65f \ + --hash=sha256:bc9eafba001ff8569cfa252fe7f04ba553622702b4b473b656dd0866edf6b8d4 \ + --hash=sha256:d210bab7e1ce0b2f2e568548b3903ea6aec349019fc1398cd2a0c069e8342e62 \ + --hash=sha256:dddffce48d7e6057008999aed2d8a9daecc57a48c45a4f8c475e00880eb2e41d \ + --hash=sha256:e1cee6dc291251f3fb6b0127fdd96c0439ac1ea97e01571d06910df72d6ac6e1 \ + --hash=sha256:e8c9579e20d5ecdc4f61336cdd032710cb8c38d5ae9c4fce0cf9ea031cef21cb + # via + # chex + # clu + # jax + # optax +libtpu==0.0.2 \ + --hash=sha256:9e1f7899ece1f4bb8c0832f5570246b46f1ca57837e5b62e1409ee48cf06403f + # via jax +libtpu-nightly==0.1.dev20241010+nightly.cleanup \ + --hash=sha256:935fe93a8d34e4566c168e9bc8c690d4729d5cf4e051625e86f4e4fa9a261232 + # via jax +markdown-it-py==3.0.0 \ + --hash=sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1 \ + --hash=sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb + # via rich +mdurl==0.1.2 \ + --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ + --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba + # via markdown-it-py +ml-collections==0.1.1 \ + --hash=sha256:3fefcc72ec433aa1e5d32307a3e474bbb67f405be814ea52a2166bfc9dbe68cc + # via clu +ml-dtypes==0.5.0 \ + --hash=sha256:099e09edd54e676903b4538f3815b5ab96f5b119690514602d96bfdb67172cbe \ + --hash=sha256:2e7534392682c3098bc7341648c650864207169c654aed83143d7a19c67ae06f \ + --hash=sha256:3e7d3a380fe73a63c884f06136f8baa7a5249cc8e9fdec677997dd78549f8128 \ + --hash=sha256:54415257f00eb44fbcc807454efac3356f75644f1cbfc2d4e5522a72ae1dacab \ 
+ --hash=sha256:5f2b59233a0dbb6a560b3137ed6125433289ccba2f8d9c3695a52423a369ed15 \ + --hash=sha256:60275f2b51b56834e840c4809fca840565f9bf8e9a73f6d8c94f5b5935701215 \ + --hash=sha256:76942f6aeb5c40766d5ea62386daa4148e6a54322aaf5b53eae9e7553240222f \ + --hash=sha256:7ee9c320bb0f9ffdf9f6fa6a696ef2e005d1f66438d6f1c1457338e00a02e8cf \ + --hash=sha256:8c32138975797e681eb175996d64356bcfa124bdbb6a70460b9768c2b35a6fa4 \ + --hash=sha256:968fede07d1f9b926a63df97d25ac656cac1a57ebd33701734eaf704bc55d8d8 \ + --hash=sha256:a03fc861b86cc586728e3d093ba37f0cc05e65330c3ebd7688e7bae8290f8859 \ + --hash=sha256:a38df8df61194aeaae1ab7579075779b4ad32cd1cffd012c28be227fa7f2a70a \ + --hash=sha256:a988bac6572630e1e9c2edd9b1277b4eefd1c86209e52b0d061b775ac33902ff \ + --hash=sha256:ab046f2ff789b1f11b2491909682c5d089934835f9a760fafc180e47dcb676b8 \ + --hash=sha256:afa08343069874a30812871d639f9c02b4158ace065601406a493a8511180c02 \ + --hash=sha256:c7a9152f5876fef565516aa5dd1dccd6fc298a5891b2467973905103eb5c7856 \ + --hash=sha256:cb5cc7b25acabd384f75bbd78892d0c724943f3e2e1986254665a1aa10982e07 \ + --hash=sha256:d3b3db9990c3840986a0e70524e122cfa32b91139c3653df76121ba7776e015f \ + --hash=sha256:d4b1a70a3e5219790d6b55b9507606fc4e02911d1497d16c18dd721eb7efe7d0 \ + --hash=sha256:dc74fd9995513d33eac63d64e436240f5494ec74d522a9f0920194942fc3d2d7 \ + --hash=sha256:e04fde367b2fe901b1d47234426fe8819909bd1dd862a5adb630f27789c20599 + # via + # jax + # jaxlib + # tensorstore +msgpack==1.1.0 \ + --hash=sha256:06f5fd2f6bb2a7914922d935d3b8bb4a7fff3a9a91cfce6d06c13bc42bec975b \ + --hash=sha256:071603e2f0771c45ad9bc65719291c568d4edf120b44eb36324dcb02a13bfddf \ + --hash=sha256:0907e1a7119b337971a689153665764adc34e89175f9a34793307d9def08e6ca \ + --hash=sha256:0f92a83b84e7c0749e3f12821949d79485971f087604178026085f60ce109330 \ + --hash=sha256:115a7af8ee9e8cddc10f87636767857e7e3717b7a2e97379dc2054712693e90f \ + --hash=sha256:13599f8829cfbe0158f6456374e9eea9f44eee08076291771d8ae93eda56607f \ + --hash=sha256:17fb65dd0bec285907f68b15734a993ad3fc94332b5bb21b0435846228de1f39 \ + --hash=sha256:2137773500afa5494a61b1208619e3871f75f27b03bcfca7b3a7023284140247 \ + --hash=sha256:3180065ec2abbe13a4ad37688b61b99d7f9e012a535b930e0e683ad6bc30155b \ + --hash=sha256:398b713459fea610861c8a7b62a6fec1882759f308ae0795b5413ff6a160cf3c \ + --hash=sha256:3d364a55082fb2a7416f6c63ae383fbd903adb5a6cf78c5b96cc6316dc1cedc7 \ + --hash=sha256:3df7e6b05571b3814361e8464f9304c42d2196808e0119f55d0d3e62cd5ea044 \ + --hash=sha256:41c991beebf175faf352fb940bf2af9ad1fb77fd25f38d9142053914947cdbf6 \ + --hash=sha256:42f754515e0f683f9c79210a5d1cad631ec3d06cea5172214d2176a42e67e19b \ + --hash=sha256:452aff037287acb1d70a804ffd022b21fa2bb7c46bee884dbc864cc9024128a0 \ + --hash=sha256:4676e5be1b472909b2ee6356ff425ebedf5142427842aa06b4dfd5117d1ca8a2 \ + --hash=sha256:46c34e99110762a76e3911fc923222472c9d681f1094096ac4102c18319e6468 \ + --hash=sha256:471e27a5787a2e3f974ba023f9e265a8c7cfd373632247deb225617e3100a3c7 \ + --hash=sha256:4a1964df7b81285d00a84da4e70cb1383f2e665e0f1f2a7027e683956d04b734 \ + --hash=sha256:4b51405e36e075193bc051315dbf29168d6141ae2500ba8cd80a522964e31434 \ + --hash=sha256:4d1b7ff2d6146e16e8bd665ac726a89c74163ef8cd39fa8c1087d4e52d3a2325 \ + --hash=sha256:53258eeb7a80fc46f62fd59c876957a2d0e15e6449a9e71842b6d24419d88ca1 \ + --hash=sha256:534480ee5690ab3cbed89d4c8971a5c631b69a8c0883ecfea96c19118510c846 \ + --hash=sha256:58638690ebd0a06427c5fe1a227bb6b8b9fdc2bd07701bec13c2335c82131a88 \ + --hash=sha256:58dfc47f8b102da61e8949708b3eafc3504509a5728f8b4ddef84bd9e16ad420 \ + 
--hash=sha256:59caf6a4ed0d164055ccff8fe31eddc0ebc07cf7326a2aaa0dbf7a4001cd823e \ + --hash=sha256:5dbad74103df937e1325cc4bfeaf57713be0b4f15e1c2da43ccdd836393e2ea2 \ + --hash=sha256:5e1da8f11a3dd397f0a32c76165cf0c4eb95b31013a94f6ecc0b280c05c91b59 \ + --hash=sha256:646afc8102935a388ffc3914b336d22d1c2d6209c773f3eb5dd4d6d3b6f8c1cb \ + --hash=sha256:64fc9068d701233effd61b19efb1485587560b66fe57b3e50d29c5d78e7fef68 \ + --hash=sha256:65553c9b6da8166e819a6aa90ad15288599b340f91d18f60b2061f402b9a4915 \ + --hash=sha256:685ec345eefc757a7c8af44a3032734a739f8c45d1b0ac45efc5d8977aa4720f \ + --hash=sha256:6ad622bf7756d5a497d5b6836e7fc3752e2dd6f4c648e24b1803f6048596f701 \ + --hash=sha256:73322a6cc57fcee3c0c57c4463d828e9428275fb85a27aa2aa1a92fdc42afd7b \ + --hash=sha256:74bed8f63f8f14d75eec75cf3d04ad581da6b914001b474a5d3cd3372c8cc27d \ + --hash=sha256:79ec007767b9b56860e0372085f8504db5d06bd6a327a335449508bbee9648fa \ + --hash=sha256:7a946a8992941fea80ed4beae6bff74ffd7ee129a90b4dd5cf9c476a30e9708d \ + --hash=sha256:7ad442d527a7e358a469faf43fda45aaf4ac3249c8310a82f0ccff9164e5dccd \ + --hash=sha256:7c9a35ce2c2573bada929e0b7b3576de647b0defbd25f5139dcdaba0ae35a4cc \ + --hash=sha256:7e7b853bbc44fb03fbdba34feb4bd414322180135e2cb5164f20ce1c9795ee48 \ + --hash=sha256:879a7b7b0ad82481c52d3c7eb99bf6f0645dbdec5134a4bddbd16f3506947feb \ + --hash=sha256:8a706d1e74dd3dea05cb54580d9bd8b2880e9264856ce5068027eed09680aa74 \ + --hash=sha256:8a84efb768fb968381e525eeeb3d92857e4985aacc39f3c47ffd00eb4509315b \ + --hash=sha256:8cf9e8c3a2153934a23ac160cc4cba0ec035f6867c8013cc6077a79823370346 \ + --hash=sha256:8da4bf6d54ceed70e8861f833f83ce0814a2b72102e890cbdfe4b34764cdd66e \ + --hash=sha256:8e59bca908d9ca0de3dc8684f21ebf9a690fe47b6be93236eb40b99af28b6ea6 \ + --hash=sha256:914571a2a5b4e7606997e169f64ce53a8b1e06f2cf2c3a7273aa106236d43dd5 \ + --hash=sha256:a51abd48c6d8ac89e0cfd4fe177c61481aca2d5e7ba42044fd218cfd8ea9899f \ + --hash=sha256:a52a1f3a5af7ba1c9ace055b659189f6c669cf3657095b50f9602af3a3ba0fe5 \ + --hash=sha256:ad33e8400e4ec17ba782f7b9cf868977d867ed784a1f5f2ab46e7ba53b6e1e1b \ + --hash=sha256:b4c01941fd2ff87c2a934ee6055bda4ed353a7846b8d4f341c428109e9fcde8c \ + --hash=sha256:bce7d9e614a04d0883af0b3d4d501171fbfca038f12c77fa838d9f198147a23f \ + --hash=sha256:c40ffa9a15d74e05ba1fe2681ea33b9caffd886675412612d93ab17b58ea2fec \ + --hash=sha256:c5a91481a3cc573ac8c0d9aace09345d989dc4a0202b7fcb312c88c26d4e71a8 \ + --hash=sha256:c921af52214dcbb75e6bdf6a661b23c3e6417f00c603dd2070bccb5c3ef499f5 \ + --hash=sha256:d46cf9e3705ea9485687aa4001a76e44748b609d260af21c4ceea7f2212a501d \ + --hash=sha256:d8ce0b22b890be5d252de90d0e0d119f363012027cf256185fc3d474c44b1b9e \ + --hash=sha256:dd432ccc2c72b914e4cb77afce64aab761c1137cc698be3984eee260bcb2896e \ + --hash=sha256:e0856a2b7e8dcb874be44fea031d22e5b3a19121be92a1e098f46068a11b0870 \ + --hash=sha256:e1f3c3d21f7cf67bcf2da8e494d30a75e4cf60041d98b3f79875afb5b96f3a3f \ + --hash=sha256:f1ba6136e650898082d9d5a5217d5906d1e138024f836ff48691784bbe1adf96 \ + --hash=sha256:f3e9b4936df53b970513eac1758f3882c88658a220b58dcc1e39606dccaaf01c \ + --hash=sha256:f80bc7d47f76089633763f952e67f8214cb7b3ee6bfa489b3cb6a84cfac114cd \ + --hash=sha256:fd2906780f25c8ed5d7b323379f6138524ba793428db5d0e9d226d3fa6aa1788 + # via + # flax + # orbax-checkpoint +nest-asyncio==1.6.0 \ + --hash=sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe \ + --hash=sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c + # via orbax-checkpoint +numpy==2.1.3 \ + 
--hash=sha256:016d0f6f5e77b0f0d45d77387ffa4bb89816b57c835580c3ce8e099ef830befe \ + --hash=sha256:02135ade8b8a84011cbb67dc44e07c58f28575cf9ecf8ab304e51c05528c19f0 \ + --hash=sha256:08788d27a5fd867a663f6fc753fd7c3ad7e92747efc73c53bca2f19f8bc06f48 \ + --hash=sha256:0d30c543f02e84e92c4b1f415b7c6b5326cbe45ee7882b6b77db7195fb971e3a \ + --hash=sha256:0fa14563cc46422e99daef53d725d0c326e99e468a9320a240affffe87852564 \ + --hash=sha256:13138eadd4f4da03074851a698ffa7e405f41a0845a6b1ad135b81596e4e9958 \ + --hash=sha256:14e253bd43fc6b37af4921b10f6add6925878a42a0c5fe83daee390bca80bc17 \ + --hash=sha256:15cb89f39fa6d0bdfb600ea24b250e5f1a3df23f901f51c8debaa6a5d122b2f0 \ + --hash=sha256:17ee83a1f4fef3c94d16dc1802b998668b5419362c8a4f4e8a491de1b41cc3ee \ + --hash=sha256:2312b2aa89e1f43ecea6da6ea9a810d06aae08321609d8dc0d0eda6d946a541b \ + --hash=sha256:2564fbdf2b99b3f815f2107c1bbc93e2de8ee655a69c261363a1172a79a257d4 \ + --hash=sha256:3522b0dfe983a575e6a9ab3a4a4dfe156c3e428468ff08ce582b9bb6bd1d71d4 \ + --hash=sha256:4394bc0dbd074b7f9b52024832d16e019decebf86caf909d94f6b3f77a8ee3b6 \ + --hash=sha256:45966d859916ad02b779706bb43b954281db43e185015df6eb3323120188f9e4 \ + --hash=sha256:4d1167c53b93f1f5d8a139a742b3c6f4d429b54e74e6b57d0eff40045187b15d \ + --hash=sha256:4f2015dfe437dfebbfce7c85c7b53d81ba49e71ba7eadbf1df40c915af75979f \ + --hash=sha256:50ca6aba6e163363f132b5c101ba078b8cbd3fa92c7865fd7d4d62d9779ac29f \ + --hash=sha256:50d18c4358a0a8a53f12a8ba9d772ab2d460321e6a93d6064fc22443d189853f \ + --hash=sha256:5641516794ca9e5f8a4d17bb45446998c6554704d888f86df9b200e66bdcce56 \ + --hash=sha256:576a1c1d25e9e02ed7fa5477f30a127fe56debd53b8d2c89d5578f9857d03ca9 \ + --hash=sha256:6a4825252fcc430a182ac4dee5a505053d262c807f8a924603d411f6718b88fd \ + --hash=sha256:72dcc4a35a8515d83e76b58fdf8113a5c969ccd505c8a946759b24e3182d1f23 \ + --hash=sha256:747641635d3d44bcb380d950679462fae44f54b131be347d5ec2bce47d3df9ed \ + --hash=sha256:762479be47a4863e261a840e8e01608d124ee1361e48b96916f38b119cfda04a \ + --hash=sha256:78574ac2d1a4a02421f25da9559850d59457bac82f2b8d7a44fe83a64f770098 \ + --hash=sha256:825656d0743699c529c5943554d223c021ff0494ff1442152ce887ef4f7561a1 \ + --hash=sha256:8637dcd2caa676e475503d1f8fdb327bc495554e10838019651b76d17b98e512 \ + --hash=sha256:96fe52fcdb9345b7cd82ecd34547fca4321f7656d500eca497eb7ea5a926692f \ + --hash=sha256:973faafebaae4c0aaa1a1ca1ce02434554d67e628b8d805e61f874b84e136b09 \ + --hash=sha256:996bb9399059c5b82f76b53ff8bb686069c05acc94656bb259b1d63d04a9506f \ + --hash=sha256:a38c19106902bb19351b83802531fea19dee18e5b37b36454f27f11ff956f7fc \ + --hash=sha256:a6b46587b14b888e95e4a24d7b13ae91fa22386c199ee7b418f449032b2fa3b8 \ + --hash=sha256:a9f7f672a3388133335589cfca93ed468509cb7b93ba3105fce780d04a6576a0 \ + --hash=sha256:aa08e04e08aaf974d4458def539dece0d28146d866a39da5639596f4921fd761 \ + --hash=sha256:b0df3635b9c8ef48bd3be5f862cf71b0a4716fa0e702155c45067c6b711ddcef \ + --hash=sha256:b47fbb433d3260adcd51eb54f92a2ffbc90a4595f8970ee00e064c644ac788f5 \ + --hash=sha256:baed7e8d7481bfe0874b566850cb0b85243e982388b7b23348c6db2ee2b2ae8e \ + --hash=sha256:bc6f24b3d1ecc1eebfbf5d6051faa49af40b03be1aaa781ebdadcbc090b4539b \ + --hash=sha256:c006b607a865b07cd981ccb218a04fc86b600411d83d6fc261357f1c0966755d \ + --hash=sha256:c181ba05ce8299c7aa3125c27b9c2167bca4a4445b7ce73d5febc411ca692e43 \ + --hash=sha256:c7662f0e3673fe4e832fe07b65c50342ea27d989f92c80355658c7f888fcc83c \ + --hash=sha256:c80e4a09b3d95b4e1cac08643f1152fa71a0a821a2d4277334c88d54b2219a41 \ + 
--hash=sha256:c894b4305373b9c5576d7a12b473702afdf48ce5369c074ba304cc5ad8730dff \ + --hash=sha256:d7aac50327da5d208db2eec22eb11e491e3fe13d22653dce51b0f4109101b408 \ + --hash=sha256:d89dd2b6da69c4fff5e39c28a382199ddedc3a5be5390115608345dec660b9e2 \ + --hash=sha256:d9beb777a78c331580705326d2367488d5bc473b49a9bc3036c154832520aca9 \ + --hash=sha256:dc258a761a16daa791081d026f0ed4399b582712e6fc887a95af09df10c5ca57 \ + --hash=sha256:e14e26956e6f1696070788252dcdff11b4aca4c3e8bd166e0df1bb8f315a67cb \ + --hash=sha256:e6988e90fcf617da2b5c78902fe8e668361b43b4fe26dbf2d7b0f8034d4cafb9 \ + --hash=sha256:e711e02f49e176a01d0349d82cb5f05ba4db7d5e7e0defd026328e5cfb3226d3 \ + --hash=sha256:ea4dedd6e394a9c180b33c2c872b92f7ce0f8e7ad93e9585312b0c5a04777a4a \ + --hash=sha256:ecc76a9ba2911d8d37ac01de72834d8849e55473457558e12995f4cd53e778e0 \ + --hash=sha256:f55ba01150f52b1027829b50d70ef1dafd9821ea82905b63936668403c3b471e \ + --hash=sha256:f653490b33e9c3a4c1c01d41bc2aef08f9475af51146e4a7710c450cf9761598 \ + --hash=sha256:fa2d1337dc61c8dc417fbccf20f6d1e139896a30721b7f1e832b2bb6ef4eb6c4 + # via + # -r build/requirements.in + # chex + # clu + # flax + # jax + # jaxlib + # ml-dtypes + # optax + # orbax-checkpoint + # scipy + # tensorstore +opt-einsum==3.4.0 \ + --hash=sha256:69bb92469f86a1565195ece4ac0323943e83477171b91d24c35afe028a90d7cd \ + --hash=sha256:96ca72f1b886d148241348783498194c577fa30a8faac108586b14f1ba4473ac + # via jax +optax==0.2.3 \ + --hash=sha256:083e603dcd731d7e74d99f71c12f77937dd53f79001b4c09c290e4f47dd2e94f \ + --hash=sha256:ec7ab925440b0c5a512e1f24fba0fb3e7d760a7fd5d2496d7a691e9d37da01d9 + # via + # -r build/requirements.in + # flax +orbax==0.1.9 \ + --hash=sha256:42dd487ceef9fbf027f4720f3d041686af75120466a528a8a8141226bc197218 + # via -r build/requirements.in +orbax-checkpoint==0.8.0 \ + --hash=sha256:0754ecc2e5fc858e62bbcf610606502d8e1c9ada7295d9bb49cc172f884b0b1e \ + --hash=sha256:df8e353feb7f4eeba9f5b16f704699df54c3c44c5c6ec4d4d117c40bf27830cc + # via + # flax + # orbax +packaging==24.1 \ + --hash=sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002 \ + --hash=sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124 + # via clu +protobuf==5.28.3 \ + --hash=sha256:0c4eec6f987338617072592b97943fdbe30d019c56126493111cf24344c1cc24 \ + --hash=sha256:135658402f71bbd49500322c0f736145731b16fc79dc8f367ab544a17eab4535 \ + --hash=sha256:27b246b3723692bf1068d5734ddaf2fccc2cdd6e0c9b47fe099244d80200593b \ + --hash=sha256:3e6101d095dfd119513cde7259aa703d16c6bbdfae2554dfe5cfdbe94e32d548 \ + --hash=sha256:3fa2de6b8b29d12c61911505d893afe7320ce7ccba4df913e2971461fa36d584 \ + --hash=sha256:64badbc49180a5e401f373f9ce7ab1d18b63f7dd4a9cdc43c92b9f0b481cef7b \ + --hash=sha256:70585a70fc2dd4818c51287ceef5bdba6387f88a578c86d47bb34669b5552c36 \ + --hash=sha256:712319fbdddb46f21abb66cd33cb9e491a5763b2febd8f228251add221981135 \ + --hash=sha256:91fba8f445723fcf400fdbe9ca796b19d3b1242cd873907979b9ed71e4afe868 \ + --hash=sha256:a3f6857551e53ce35e60b403b8a27b0295f7d6eb63d10484f12bc6879c715687 \ + --hash=sha256:cee1757663fa32a1ee673434fcf3bf24dd54763c79690201208bafec62f19eed + # via + # -r build/requirements.in + # orbax-checkpoint +pygments==2.18.0 \ + --hash=sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199 \ + --hash=sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a + # via rich +pyyaml==6.0.2 \ + --hash=sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff \ + 
--hash=sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48 \ + --hash=sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086 \ + --hash=sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e \ + --hash=sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133 \ + --hash=sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5 \ + --hash=sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484 \ + --hash=sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee \ + --hash=sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5 \ + --hash=sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68 \ + --hash=sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a \ + --hash=sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf \ + --hash=sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99 \ + --hash=sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8 \ + --hash=sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85 \ + --hash=sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19 \ + --hash=sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc \ + --hash=sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a \ + --hash=sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1 \ + --hash=sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317 \ + --hash=sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c \ + --hash=sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631 \ + --hash=sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d \ + --hash=sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652 \ + --hash=sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5 \ + --hash=sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e \ + --hash=sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b \ + --hash=sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8 \ + --hash=sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476 \ + --hash=sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706 \ + --hash=sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563 \ + --hash=sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237 \ + --hash=sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b \ + --hash=sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083 \ + --hash=sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180 \ + --hash=sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425 \ + --hash=sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e \ + --hash=sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f \ + --hash=sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725 \ + --hash=sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183 \ + --hash=sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab \ + --hash=sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774 \ + --hash=sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725 \ + 
--hash=sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e \ + --hash=sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5 \ + --hash=sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d \ + --hash=sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290 \ + --hash=sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44 \ + --hash=sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed \ + --hash=sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4 \ + --hash=sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba \ + --hash=sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12 \ + --hash=sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4 + # via + # flax + # ml-collections + # orbax-checkpoint +requests==2.32.3 \ + --hash=sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760 \ + --hash=sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6 + # via jax +rich==13.9.4 \ + --hash=sha256:439594978a49a09530cff7ebc4b5c7103ef57baf48d5ea3184f21d9a2befa098 \ + --hash=sha256:6049d5e6ec054bf2779ab3358186963bac2ea89175919d699e378b99738c2a90 + # via flax +scipy==1.14.1 \ + --hash=sha256:0c2f95de3b04e26f5f3ad5bb05e74ba7f68b837133a4492414b3afd79dfe540e \ + --hash=sha256:1729560c906963fc8389f6aac023739ff3983e727b1a4d87696b7bf108316a79 \ + --hash=sha256:278266012eb69f4a720827bdd2dc54b2271c97d84255b2faaa8f161a158c3b37 \ + --hash=sha256:2843f2d527d9eebec9a43e6b406fb7266f3af25a751aa91d62ff416f54170bc5 \ + --hash=sha256:2da0469a4ef0ecd3693761acbdc20f2fdeafb69e6819cc081308cc978153c675 \ + --hash=sha256:2ff0a7e01e422c15739ecd64432743cf7aae2b03f3084288f399affcefe5222d \ + --hash=sha256:2ff38e22128e6c03ff73b6bb0f85f897d2362f8c052e3b8ad00532198fbdae3f \ + --hash=sha256:30ac8812c1d2aab7131a79ba62933a2a76f582d5dbbc695192453dae67ad6310 \ + --hash=sha256:3a1b111fac6baec1c1d92f27e76511c9e7218f1695d61b59e05e0fe04dc59617 \ + --hash=sha256:4079b90df244709e675cdc8b93bfd8a395d59af40b72e339c2287c91860deb8e \ + --hash=sha256:5149e3fd2d686e42144a093b206aef01932a0059c2a33ddfa67f5f035bdfe13e \ + --hash=sha256:5a275584e726026a5699459aa72f828a610821006228e841b94275c4a7c08417 \ + --hash=sha256:631f07b3734d34aced009aaf6fedfd0eb3498a97e581c3b1e5f14a04164a456d \ + --hash=sha256:716e389b694c4bb564b4fc0c51bc84d381735e0d39d3f26ec1af2556ec6aad94 \ + --hash=sha256:8426251ad1e4ad903a4514712d2fa8fdd5382c978010d1c6f5f37ef286a713ad \ + --hash=sha256:8475230e55549ab3f207bff11ebfc91c805dc3463ef62eda3ccf593254524ce8 \ + --hash=sha256:8bddf15838ba768bb5f5083c1ea012d64c9a444e16192762bd858f1e126196d0 \ + --hash=sha256:8e32dced201274bf96899e6491d9ba3e9a5f6b336708656466ad0522d8528f69 \ + --hash=sha256:8f9ea80f2e65bdaa0b7627fb00cbeb2daf163caa015e59b7516395fe3bd1e066 \ + --hash=sha256:97c5dddd5932bd2a1a31c927ba5e1463a53b87ca96b5c9bdf5dfd6096e27efc3 \ + --hash=sha256:a49f6ed96f83966f576b33a44257d869756df6cf1ef4934f59dd58b25e0327e5 \ + --hash=sha256:af29a935803cc707ab2ed7791c44288a682f9c8107bc00f0eccc4f92c08d6e07 \ + --hash=sha256:b05d43735bb2f07d689f56f7b474788a13ed8adc484a85aa65c0fd931cf9ccd2 \ + --hash=sha256:b28d2ca4add7ac16ae8bb6632a3c86e4b9e4d52d3e34267f6e1b0c1f8d87e389 \ + --hash=sha256:b99722ea48b7ea25e8e015e8341ae74624f72e5f21fc2abd45f3a93266de4c5d \ + --hash=sha256:baff393942b550823bfce952bb62270ee17504d02a1801d7fd0719534dfb9c84 \ + --hash=sha256:c0ee987efa6737242745f347835da2cc5bb9f1b42996a4d97d5c7ff7928cb6f2 \ + 
--hash=sha256:d0d2821003174de06b69e58cef2316a6622b60ee613121199cb2852a873f8cf3 \ + --hash=sha256:e0cf28db0f24a38b2a0ca33a85a54852586e43cf6fd876365c86e0657cfe7d73 \ + --hash=sha256:e4f5a7c49323533f9103d4dacf4e4f07078f360743dec7f7596949149efeec06 \ + --hash=sha256:eb58ca0abd96911932f688528977858681a59d61a7ce908ffd355957f7025cfc \ + --hash=sha256:edaf02b82cd7639db00dbff629995ef185c8df4c3ffa71a5562a595765a06ce1 \ + --hash=sha256:fef8c87f8abfb884dac04e97824b61299880c43f4ce675dd2cbeadd3c9b466d2 + # via + # jax + # jaxlib +six==1.16.0 \ + --hash=sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926 \ + --hash=sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254 + # via ml-collections +tensorstore==0.1.67 \ + --hash=sha256:186664b53d438f041b6aa706f0537147e4a23c2a4920f4483c77167967042081 \ + --hash=sha256:1b9950271f740b60286d6f88af740debb7f471036337ac864673415ef7dc46f0 \ + --hash=sha256:32cd94e9974e1683c1984041a1f12f8db0dc94a8cbc266e444451dca0f4228a4 \ + --hash=sha256:3476f2a3338d858dd34fcfdb8120df90203acc606fe41f8fdc70a8f3aee0e5e1 \ + --hash=sha256:3abfe92bf11721b43ed124c5f00c6c4b191b330c3ab0a6eb2cc8a4aa06760864 \ + --hash=sha256:53a9efd39ec0c9a8ccc11d4ffda719d210e95c4a4e3a9ccd6ea9a012e0794596 \ + --hash=sha256:56372833decf2e9fd6e57e0619e2eb167f22b7f9a5d4fa715b17959e4cdf2983 \ + --hash=sha256:686d330c8689306e390ed46aff85337f836e9e8ffcee019c89ce47e58bdae8cc \ + --hash=sha256:74eb34cea61081c6505204fe59e6183c67bf68535dd0f5a35eb6db04a951e9b9 \ + --hash=sha256:82ec1e66bf5f581f0192ff257c162db3ceccab3a0fb42378c06efeb555b46fe8 \ + --hash=sha256:83f7281d5212f080554a23bfebe09ec4d9ce07047a8146dbb4350d5664d955a9 \ + --hash=sha256:937da6006e1303960bcca8542168973735915207f97a93dc40288f1b26a3a7c1 \ + --hash=sha256:972fc74103d672aada6cb5acbd25094482f56c12d3d6a3d11fd49f209c3e451b \ + --hash=sha256:bbbcf520a167cd9466c03c6af8cd92aa8c82fab0b7858a188053a329c1f152b9 \ + --hash=sha256:cfcc4e86f06e22524f29869fdbf432531de71d8f757aa3b749331d2b5e00079c \ + --hash=sha256:d3a88a1c3db0fab891e652f1eefa82aa846ae686927cd8ff0c53f6f10d245f99 \ + --hash=sha256:dbc24747e114f11d168fc358cad051e1a2025e6ce8fb3d33b25db51755f8aff5 \ + --hash=sha256:dd6be769293479be523c2ac8a33cf9b5dbc8e5b37436bad740e3d7a782e91232 \ + --hash=sha256:e7421d27cb0ac28acaeb4a5f11a61d3901b48f06a5213b16fef5e11e1ef199fc \ + --hash=sha256:ee9a1000e8e7ebdf495272362fdb66957fba0753cc556a7e98f584cea08a6295 \ + --hash=sha256:fe25948659e8b3b93d12e7c609be6b8d71ba2b2aaba2fea451b7cf95cc340908 + # via + # flax + # orbax-checkpoint +toolz==1.0.0 \ + --hash=sha256:292c8f1c4e7516bf9086f8850935c799a874039c8bcf959d47b600e4c44a6236 \ + --hash=sha256:2c86e3d9a04798ac556793bced838816296a2f085017664e4995cb40a1047a02 + # via chex +typing-extensions==4.12.2 \ + --hash=sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d \ + --hash=sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8 + # via + # chex + # clu + # etils + # flax + # orbax-checkpoint +urllib3==2.2.3 \ + --hash=sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac \ + --hash=sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9 + # via requests +wrapt==1.16.0 \ + --hash=sha256:0d2691979e93d06a95a26257adb7bfd0c93818e89b1406f5a28f36e0d8c1e1fc \ + --hash=sha256:14d7dc606219cdd7405133c713f2c218d4252f2a469003f8c46bb92d5d095d81 \ + --hash=sha256:1a5db485fe2de4403f13fafdc231b0dbae5eca4359232d2efc79025527375b09 \ + --hash=sha256:1acd723ee2a8826f3d53910255643e33673e1d11db84ce5880675954183ec47e \ + 
--hash=sha256:1ca9b6085e4f866bd584fb135a041bfc32cab916e69f714a7d1d397f8c4891ca \ + --hash=sha256:1dd50a2696ff89f57bd8847647a1c363b687d3d796dc30d4dd4a9d1689a706f0 \ + --hash=sha256:2076fad65c6736184e77d7d4729b63a6d1ae0b70da4868adeec40989858eb3fb \ + --hash=sha256:2a88e6010048489cda82b1326889ec075a8c856c2e6a256072b28eaee3ccf487 \ + --hash=sha256:3ebf019be5c09d400cf7b024aa52b1f3aeebeff51550d007e92c3c1c4afc2a40 \ + --hash=sha256:418abb18146475c310d7a6dc71143d6f7adec5b004ac9ce08dc7a34e2babdc5c \ + --hash=sha256:43aa59eadec7890d9958748db829df269f0368521ba6dc68cc172d5d03ed8060 \ + --hash=sha256:44a2754372e32ab315734c6c73b24351d06e77ffff6ae27d2ecf14cf3d229202 \ + --hash=sha256:490b0ee15c1a55be9c1bd8609b8cecd60e325f0575fc98f50058eae366e01f41 \ + --hash=sha256:49aac49dc4782cb04f58986e81ea0b4768e4ff197b57324dcbd7699c5dfb40b9 \ + --hash=sha256:5eb404d89131ec9b4f748fa5cfb5346802e5ee8836f57d516576e61f304f3b7b \ + --hash=sha256:5f15814a33e42b04e3de432e573aa557f9f0f56458745c2074952f564c50e664 \ + --hash=sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d \ + --hash=sha256:66027d667efe95cc4fa945af59f92c5a02c6f5bb6012bff9e60542c74c75c362 \ + --hash=sha256:66dfbaa7cfa3eb707bbfcd46dab2bc6207b005cbc9caa2199bcbc81d95071a00 \ + --hash=sha256:685f568fa5e627e93f3b52fda002c7ed2fa1800b50ce51f6ed1d572d8ab3e7fc \ + --hash=sha256:6906c4100a8fcbf2fa735f6059214bb13b97f75b1a61777fcf6432121ef12ef1 \ + --hash=sha256:6a42cd0cfa8ffc1915aef79cb4284f6383d8a3e9dcca70c445dcfdd639d51267 \ + --hash=sha256:6dcfcffe73710be01d90cae08c3e548d90932d37b39ef83969ae135d36ef3956 \ + --hash=sha256:6f6eac2360f2d543cc875a0e5efd413b6cbd483cb3ad7ebf888884a6e0d2e966 \ + --hash=sha256:72554a23c78a8e7aa02abbd699d129eead8b147a23c56e08d08dfc29cfdddca1 \ + --hash=sha256:73870c364c11f03ed072dda68ff7aea6d2a3a5c3fe250d917a429c7432e15228 \ + --hash=sha256:73aa7d98215d39b8455f103de64391cb79dfcad601701a3aa0dddacf74911d72 \ + --hash=sha256:75ea7d0ee2a15733684badb16de6794894ed9c55aa5e9903260922f0482e687d \ + --hash=sha256:7bd2d7ff69a2cac767fbf7a2b206add2e9a210e57947dd7ce03e25d03d2de292 \ + --hash=sha256:807cc8543a477ab7422f1120a217054f958a66ef7314f76dd9e77d3f02cdccd0 \ + --hash=sha256:8e9723528b9f787dc59168369e42ae1c3b0d3fadb2f1a71de14531d321ee05b0 \ + --hash=sha256:9090c9e676d5236a6948330e83cb89969f433b1943a558968f659ead07cb3b36 \ + --hash=sha256:9153ed35fc5e4fa3b2fe97bddaa7cbec0ed22412b85bcdaf54aeba92ea37428c \ + --hash=sha256:9159485323798c8dc530a224bd3ffcf76659319ccc7bbd52e01e73bd0241a0c5 \ + --hash=sha256:941988b89b4fd6b41c3f0bfb20e92bd23746579736b7343283297c4c8cbae68f \ + --hash=sha256:94265b00870aa407bd0cbcfd536f17ecde43b94fb8d228560a1e9d3041462d73 \ + --hash=sha256:98b5e1f498a8ca1858a1cdbffb023bfd954da4e3fa2c0cb5853d40014557248b \ + --hash=sha256:9b201ae332c3637a42f02d1045e1d0cccfdc41f1f2f801dafbaa7e9b4797bfc2 \ + --hash=sha256:a0ea261ce52b5952bf669684a251a66df239ec6d441ccb59ec7afa882265d593 \ + --hash=sha256:a33a747400b94b6d6b8a165e4480264a64a78c8a4c734b62136062e9a248dd39 \ + --hash=sha256:a452f9ca3e3267cd4d0fcf2edd0d035b1934ac2bd7e0e57ac91ad6b95c0c6389 \ + --hash=sha256:a86373cf37cd7764f2201b76496aba58a52e76dedfaa698ef9e9688bfd9e41cf \ + --hash=sha256:ac83a914ebaf589b69f7d0a1277602ff494e21f4c2f743313414378f8f50a4cf \ + --hash=sha256:aefbc4cb0a54f91af643660a0a150ce2c090d3652cf4052a5397fb2de549cd89 \ + --hash=sha256:b3646eefa23daeba62643a58aac816945cadc0afaf21800a1421eeba5f6cfb9c \ + --hash=sha256:b47cfad9e9bbbed2339081f4e346c93ecd7ab504299403320bf85f7f85c7d46c \ + 
--hash=sha256:b935ae30c6e7400022b50f8d359c03ed233d45b725cfdd299462f41ee5ffba6f \ + --hash=sha256:bb2dee3874a500de01c93d5c71415fcaef1d858370d405824783e7a8ef5db440 \ + --hash=sha256:bc57efac2da352a51cc4658878a68d2b1b67dbe9d33c36cb826ca449d80a8465 \ + --hash=sha256:bf5703fdeb350e36885f2875d853ce13172ae281c56e509f4e6eca049bdfb136 \ + --hash=sha256:c31f72b1b6624c9d863fc095da460802f43a7c6868c5dda140f51da24fd47d7b \ + --hash=sha256:c5cd603b575ebceca7da5a3a251e69561bec509e0b46e4993e1cac402b7247b8 \ + --hash=sha256:d2efee35b4b0a347e0d99d28e884dfd82797852d62fcd7ebdeee26f3ceb72cf3 \ + --hash=sha256:d462f28826f4657968ae51d2181a074dfe03c200d6131690b7d65d55b0f360f8 \ + --hash=sha256:d5e49454f19ef621089e204f862388d29e6e8d8b162efce05208913dde5b9ad6 \ + --hash=sha256:da4813f751142436b075ed7aa012a8778aa43a99f7b36afe9b742d3ed8bdc95e \ + --hash=sha256:db2e408d983b0e61e238cf579c09ef7020560441906ca990fe8412153e3b291f \ + --hash=sha256:db98ad84a55eb09b3c32a96c576476777e87c520a34e2519d3e59c44710c002c \ + --hash=sha256:dbed418ba5c3dce92619656802cc5355cb679e58d0d89b50f116e4a9d5a9603e \ + --hash=sha256:dcdba5c86e368442528f7060039eda390cc4091bfd1dca41e8046af7c910dda8 \ + --hash=sha256:decbfa2f618fa8ed81c95ee18a387ff973143c656ef800c9f24fb7e9c16054e2 \ + --hash=sha256:e4fdb9275308292e880dcbeb12546df7f3e0f96c6b41197e0cf37d2826359020 \ + --hash=sha256:eb1b046be06b0fce7249f1d025cd359b4b80fc1c3e24ad9eca33e0dcdb2e4a35 \ + --hash=sha256:eb6e651000a19c96f452c85132811d25e9264d836951022d6e81df2fff38337d \ + --hash=sha256:ed867c42c268f876097248e05b6117a65bcd1e63b779e916fe2e33cd6fd0d3c3 \ + --hash=sha256:edfad1d29c73f9b863ebe7082ae9321374ccb10879eeabc84ba3b69f2579d537 \ + --hash=sha256:f2058f813d4f2b5e3a9eb2eb3faf8f1d99b81c3e51aeda4b168406443e8ba809 \ + --hash=sha256:f6b2d0c6703c988d334f297aa5df18c45e97b0af3679bb75059e0e0bd8b1069d \ + --hash=sha256:f8212564d49c50eb4565e502814f694e240c55551a5f1bc841d4fcaabb0a9b8a \ + --hash=sha256:ffa565331890b90056c01db69c0fe634a776f8019c143a5ae265f9c6bc4bd6d4 + # via clu +zipp==3.20.2 \ + --hash=sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350 \ + --hash=sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29 + # via etils + +# The following packages are considered to be unsafe in a requirements file: +setuptools==75.3.0 \ + --hash=sha256:f2504966861356aa38616760c0f66568e535562374995367b4e69c7143cf6bcd \ + --hash=sha256:fba5dd4d766e97be1b1681d98712680ae8f2f26d7881245f2ce9e40714f1a686 + # via chex diff --git a/build_deps/tf_dependency/build_defs.bzl.tpl b/build_deps/tf_dependency/build_defs.bzl.tpl index 48542fb8..84e2163d 100644 --- a/build_deps/tf_dependency/build_defs.bzl.tpl +++ b/build_deps/tf_dependency/build_defs.bzl.tpl @@ -2,3 +2,4 @@ D_GLIBCXX_USE_CXX11_ABI = "%{tf_cx11_abi}" CPLUSPLUS_VERSION = "%{tf_cplusplus_ver}" +DTF_VERSION_INTEGER = "%{tf_version_integer}" \ No newline at end of file diff --git a/build_deps/tf_dependency/tf_configure.bzl b/build_deps/tf_dependency/tf_configure.bzl index be03e21d..0fdb70fe 100644 --- a/build_deps/tf_dependency/tf_configure.bzl +++ b/build_deps/tf_dependency/tf_configure.bzl @@ -12,6 +12,8 @@ _TF_CXX11_ABI_FLAG = "TF_CXX11_ABI_FLAG" _TF_CPLUSPLUS_VER = "TF_CPLUSPLUS_VER" +_TF_VERSION_INTEGER = "TF_VERSION_INTEGER" + def _tpl(repository_ctx, tpl, substitutions = {}, out = None): if not out: out = tpl @@ -211,6 +213,7 @@ def _tf_pip_impl(repository_ctx): tf_shared_cc_library_path = "%s/%s" % (tf_shared_library_dir, tf_shared_cc_library_name) tf_cx11_abi = "-D_GLIBCXX_USE_CXX11_ABI=%s" % 
(repository_ctx.os.environ[_TF_CXX11_ABI_FLAG]) tf_cplusplus_ver = "-std=%s" % repository_ctx.os.environ[_TF_CPLUSPLUS_VER] + tf_version_integer = "-DTF_VERSION_INTEGER=%s" % (repository_ctx.os.environ[_TF_VERSION_INTEGER]) tf_shared_library_rule = _symlink_genrule_for_dir( repository_ctx, @@ -244,6 +247,7 @@ def _tf_pip_impl(repository_ctx): { "%{tf_cx11_abi}": tf_cx11_abi, "%{tf_cplusplus_ver}": tf_cplusplus_ver, + "%{tf_version_integer}": tf_version_integer, }, ) diff --git a/build_deps/toolchains/gpu/crosstool/BUILD.tpl b/build_deps/toolchains/gpu/crosstool/BUILD.tpl deleted file mode 100644 index de954b78..00000000 --- a/build_deps/toolchains/gpu/crosstool/BUILD.tpl +++ /dev/null @@ -1,69 +0,0 @@ -licenses(["restricted"]) - -package(default_visibility = ["//visibility:public"]) - -load(":cc_toolchain_config.bzl", "cc_toolchain_config") - - -toolchain( - name = "toolchain-linux-x86_64", - exec_compatible_with = [ - "@platforms//os:linux", - "@platforms//cpu:x86_64", - ], - target_compatible_with = [ - "@platforms//os:linux", - "@platforms//cpu:x86_64", - ], - toolchain = ":cc-compiler-local", - toolchain_type = "@bazel_tools//tools/cpp:toolchain_type", -) - -cc_toolchain_suite( - name = "toolchain", - toolchains = { - "local|compiler": ":cc-compiler-local", - "k8": ":cc-compiler-local", - "ppc": ":cc-compiler-local", - "aarch64": ":cc-compiler-local", - }, -) - -cc_toolchain( - name = "cc-compiler-local", - all_files = "%{linker_files}", - compiler_files = ":empty", - dwp_files = ":empty", - linker_files = "%{linker_files}", - objcopy_files = ":empty", - strip_files = ":empty", - # To support linker flags that need to go to the start of command line - # we need the toolchain to support parameter files. Parameter files are - # last on the command line and contain all shared libraries to link, so all - # regular options will be left of them. 
- supports_param_files = 1, - toolchain_config = ":cc-compiler-local-config", - toolchain_identifier = "local_linux", -) - -cc_toolchain_config( - name = "cc-compiler-local-config", - cpu = "local", - builtin_include_directories = "%{cxx_builtin_include_directories}".split(","), - extra_no_canonical_prefixes_flags = ["-fno-canonical-system-headers"], - host_compiler_path = "clang/bin/crosstool_wrapper_driver_is_not_gcc", - host_compiler_prefix = "/usr/bin", - host_compiler_warnings = [], - host_unfiltered_compile_flags = [], - linker_bin_path = "/usr/bin", -) - -filegroup( - name = "empty", - srcs = [], -) - -filegroup( - name = "crosstool_wrapper_driver_is_not_gcc", - srcs = ["clang/bin/crosstool_wrapper_driver_is_not_gcc"], -) diff --git a/build_deps/toolchains/gpu/crosstool/CROSSTOOL.tpl b/build_deps/toolchains/gpu/crosstool/CROSSTOOL.tpl deleted file mode 100644 index 1a13ac84..00000000 --- a/build_deps/toolchains/gpu/crosstool/CROSSTOOL.tpl +++ /dev/null @@ -1,1409 +0,0 @@ -major_version: "local" -minor_version: "" -default_target_cpu: "same_as_host" - -toolchain { - abi_version: "local" - abi_libc_version: "local" - compiler: "compiler" - host_system_name: "local" - needsPic: true - target_libc: "local" - target_cpu: "local" - target_system_name: "local" - toolchain_identifier: "local_linux" - - feature { - name: "c++11" - flag_set { - action: "c++-compile" - flag_group { - flag: "-std=c++11" - } - } - } - - feature { - name: "stdlib" - flag_set { - action: "c++-link-executable" - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "-lstdc++" - } - } - } - - feature { - name: "determinism" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - # Make C++ compilation deterministic. Use linkstamping instead of these - # compiler symbols. - flag: "-Wno-builtin-macro-redefined" - flag: "-D__DATE__=\"redacted\"" - flag: "-D__TIMESTAMP__=\"redacted\"" - flag: "-D__TIME__=\"redacted\"" - } - } - } - - feature { - name: "alwayslink" - flag_set { - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - action: "c++-link-executable" - flag_group { - flag: "-Wl,-no-as-needed" - } - } - } - - # This feature will be enabled for builds that support pic by bazel. - feature { - name: "pic" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - expand_if_all_available: "pic" - flag: "-fPIC" - } - flag_group { - expand_if_none_available: "pic" - flag: "-fPIE" - } - } - } - - # Security hardening on by default. - feature { - name: "hardening" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases. - # We need to undef it before redefining it as some distributions now - # have it enabled by default. - flag: "-U_FORTIFY_SOURCE" - flag: "-D_FORTIFY_SOURCE=1" - flag: "-fstack-protector" - } - } - flag_set { - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "-Wl,-z,relro,-z,now" - } - } - flag_set { - action: "c++-link-executable" - flag_group { - flag: "-pie" - flag: "-Wl,-z,relro,-z,now" - } - } - } - - feature { - name: "warnings" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - # All warnings are enabled. Maybe enable -Werror as well? - flag: "-Wall" - %{host_compiler_warnings} - } - } - } - - # Keep stack frames for debugging, even in opt mode. 
- feature { - name: "frame-pointer" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - flag: "-fno-omit-frame-pointer" - } - } - } - - feature { - name: "build-id" - flag_set { - action: "c++-link-executable" - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - flag_group { - # Stamp the binary with a unique identifier. - flag: "-Wl,--build-id=md5" - flag: "-Wl,--hash-style=gnu" - } - } - } - - feature { - name: "no-canonical-prefixes" - flag_set { - action: "c-compile" - action: "c++-compile" - action: "c++-link-executable" - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "-no-canonical-prefixes" - %{extra_no_canonical_prefixes_flags} - } - } - } - - feature { - name: "disable-assertions" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - flag: "-DNDEBUG" - } - } - } - - feature { - name: "linker-bin-path" - - flag_set { - action: "c++-link-executable" - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - flag_group { - %{linker_bin_path_flag} - } - } - } - - feature { - name: "common" - implies: "stdlib" - implies: "c++11" - implies: "determinism" - implies: "alwayslink" - implies: "hardening" - implies: "warnings" - implies: "frame-pointer" - implies: "build-id" - implies: "no-canonical-prefixes" - implies: "linker-bin-path" - } - - feature { - name: "opt" - implies: "common" - implies: "disable-assertions" - - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - # No debug symbols. - # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt - # or even generally? However, that can't happen here, as it requires - # special handling in Bazel. - flag: "-g0" - - # Conservative choice for -O - # -O3 can increase binary size and even slow down the resulting binaries. - # Profile first and / or use FDO if you need better performance than this. - flag: "-O2" - - # Removal of unused code and data at link time (can this increase binary size in some cases?). - flag: "-ffunction-sections" - flag: "-fdata-sections" - } - } - flag_set { - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - action: "c++-link-executable" - flag_group { - flag: "-Wl,--gc-sections" - } - } - } - - feature { - name: "fastbuild" - implies: "common" - } - - feature { - name: "dbg" - implies: "common" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - flag: "-g" - } - } - } - - # Set clang as a C/C++ compiler. - tool_path { name: "gcc" path: "%{host_compiler_path}" } - - # Use the default system toolchain for everything else. - tool_path { name: "ar" path: "/usr/bin/ar" } - tool_path { name: "compat-ld" path: "/usr/bin/ld" } - tool_path { name: "cpp" path: "/usr/bin/cpp" } - tool_path { name: "dwp" path: "/usr/bin/dwp" } - tool_path { name: "gcov" path: "/usr/bin/gcov" } - tool_path { name: "ld" path: "/usr/bin/ld" } - tool_path { name: "nm" path: "/usr/bin/nm" } - tool_path { name: "objcopy" path: "/usr/bin/objcopy" } - tool_path { name: "objdump" path: "/usr/bin/objdump" } - tool_path { name: "strip" path: "/usr/bin/strip" } - - # Enabled dynamic linking. 
- linking_mode_flags { mode: DYNAMIC } - -%{host_compiler_includes} -} - -toolchain { - abi_version: "local" - abi_libc_version: "local" - compiler: "compiler" - host_system_name: "local" - needsPic: true - target_libc: "macosx" - target_cpu: "darwin" - target_system_name: "local" - toolchain_identifier: "local_darwin" - feature { - name: "c++11" - flag_set { - action: "c++-compile" - flag_group { - flag: "-std=c++11" - } - } - } - - feature { - name: "stdlib" - flag_set { - action: "c++-link-executable" - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "-lc++" - } - } - } - - feature { - name: "determinism" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - # Make C++ compilation deterministic. Use linkstamping instead of these - # compiler symbols. - flag: "-Wno-builtin-macro-redefined" - flag: "-D__DATE__=\"redacted\"" - flag: "-D__TIMESTAMP__=\"redacted\"" - flag: "-D__TIME__=\"redacted\"" - } - } - } - - # This feature will be enabled for builds that support pic by bazel. - feature { - name: "pic" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - expand_if_all_available: "pic" - flag: "-fPIC" - } - flag_group { - expand_if_none_available: "pic" - flag: "-fPIE" - } - } - } - - # Security hardening on by default. - feature { - name: "hardening" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases. - # We need to undef it before redefining it as some distributions now - # have it enabled by default. - flag: "-U_FORTIFY_SOURCE" - flag: "-D_FORTIFY_SOURCE=1" - flag: "-fstack-protector" - } - } - flag_set { - action: "c++-link-executable" - flag_group { - flag: "-pie" - } - } - } - - feature { - name: "warnings" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - # All warnings are enabled. Maybe enable -Werror as well? - flag: "-Wall" - %{host_compiler_warnings} - } - } - } - - # Keep stack frames for debugging, even in opt mode. 
- feature { - name: "frame-pointer" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - flag: "-fno-omit-frame-pointer" - } - } - } - - feature { - name: "no-canonical-prefixes" - flag_set { - action: "c-compile" - action: "c++-compile" - action: "c++-link-executable" - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag:"-no-canonical-prefixes" - } - } - } - - feature { - name: "disable-assertions" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - flag: "-DNDEBUG" - } - } - } - - feature { - name: "linker-bin-path" - - flag_set { - action: "c++-link-executable" - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - flag_group { - %{linker_bin_path_flag} - } - } - } - - feature { - name: "undefined-dynamic" - flag_set { - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - action: "c++-link-executable" - flag_group { - flag: "-undefined" - flag: "dynamic_lookup" - } - } - } - - feature { - name: "common" - implies: "stdlib" - implies: "c++11" - implies: "determinism" - implies: "hardening" - implies: "warnings" - implies: "frame-pointer" - implies: "no-canonical-prefixes" - implies: "linker-bin-path" - implies: "undefined-dynamic" - } - - feature { - name: "opt" - implies: "common" - implies: "disable-assertions" - - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - # No debug symbols. - # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt - # or even generally? However, that can't happen here, as it requires - # special handling in Bazel. - flag: "-g0" - - # Conservative choice for -O - # -O3 can increase binary size and even slow down the resulting binaries. - # Profile first and / or use FDO if you need better performance than this. - flag: "-O2" - - # Removal of unused code and data at link time (can this increase binary size in some cases?). - flag: "-ffunction-sections" - flag: "-fdata-sections" - } - } - } - - feature { - name: "fastbuild" - implies: "common" - } - - feature { - name: "dbg" - implies: "common" - flag_set { - action: "c-compile" - action: "c++-compile" - flag_group { - flag: "-g" - } - } - } - - # Set clang as a C/C++ compiler. - tool_path { name: "gcc" path: "%{host_compiler_path}" } - - # Use the default system toolchain for everything else. - tool_path { name: "ar" path: "/usr/bin/libtool" } - tool_path { name: "compat-ld" path: "/usr/bin/ld" } - tool_path { name: "cpp" path: "/usr/bin/cpp" } - tool_path { name: "dwp" path: "/usr/bin/dwp" } - tool_path { name: "gcov" path: "/usr/bin/gcov" } - tool_path { name: "ld" path: "/usr/bin/ld" } - tool_path { name: "nm" path: "/usr/bin/nm" } - tool_path { name: "objcopy" path: "/usr/bin/objcopy" } - tool_path { name: "objdump" path: "/usr/bin/objdump" } - tool_path { name: "strip" path: "/usr/bin/strip" } - - # Enabled dynamic linking. 
- linking_mode_flags { mode: DYNAMIC } - -%{host_compiler_includes} -} - -toolchain { - toolchain_identifier: "local_windows" - host_system_name: "local" - target_system_name: "local" - - abi_version: "local" - abi_libc_version: "local" - target_cpu: "x64_windows" - compiler: "msvc-cl" - target_libc: "msvcrt" - -%{cxx_builtin_include_directory} - - tool_path { - name: "ar" - path: "%{msvc_lib_path}" - } - tool_path { - name: "ml" - path: "%{msvc_ml_path}" - } - tool_path { - name: "cpp" - path: "%{msvc_cl_path}" - } - tool_path { - name: "gcc" - path: "%{msvc_cl_path}" - } - tool_path { - name: "gcov" - path: "wrapper/bin/msvc_nop.bat" - } - tool_path { - name: "ld" - path: "%{msvc_link_path}" - } - tool_path { - name: "nm" - path: "wrapper/bin/msvc_nop.bat" - } - tool_path { - name: "objcopy" - path: "wrapper/bin/msvc_nop.bat" - } - tool_path { - name: "objdump" - path: "wrapper/bin/msvc_nop.bat" - } - tool_path { - name: "strip" - path: "wrapper/bin/msvc_nop.bat" - } - supports_interface_shared_objects: true - - # TODO(pcloudy): Review those flags below, they should be defined by cl.exe - compiler_flag: "/DCOMPILER_MSVC" - - # Don't define min/max macros in windows.h. - compiler_flag: "/DNOMINMAX" - - # Platform defines. - compiler_flag: "/D_WIN32_WINNT=0x0600" - # Turn off warning messages. - compiler_flag: "/D_CRT_SECURE_NO_DEPRECATE" - compiler_flag: "/D_CRT_SECURE_NO_WARNINGS" - compiler_flag: "/D_SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS" - - # Useful options to have on for compilation. - # Increase the capacity of object files to 2^32 sections. - compiler_flag: "/bigobj" - # Allocate 500MB for precomputed headers. - compiler_flag: "/Zm500" - # Use unsigned char by default. - compiler_flag: "/J" - # Use function level linking. - compiler_flag: "/Gy" - # Use string pooling. - compiler_flag: "/GF" - # Catch C++ exceptions only and tell the compiler to assume that functions declared - # as extern "C" never throw a C++ exception. - compiler_flag: "/EHsc" - - # Globally disabled warnings. - # Don't warn about elements of array being be default initialized. - compiler_flag: "/wd4351" - # Don't warn about no matching delete found. - compiler_flag: "/wd4291" - # Don't warn about diamond inheritance patterns. - compiler_flag: "/wd4250" - # Don't warn about insecure functions (e.g. non _s functions). - compiler_flag: "/wd4996" - - linker_flag: "/MACHINE:X64" - - feature { - name: "no_legacy_features" - } - - # TODO(klimek): Previously we were using a .bat file to start python to run - # the python script that can redirect to nvcc - unfortunately .bat files - # have a rather short maximum length for command lines (8k). Instead, we - # now use the python binary as the compiler and pass the python script to - # it at the start of the command line. Investigate different possibilities - # to run the nvcc wrapper, either using pyinstaller --onefile, or writing - # a small C++ wrapper to redirect. - feature { - name: "redirector" - enabled: true - flag_set { - action: "c-compile" - action: "c++-compile" - action: "c++-module-compile" - action: "c++-module-codegen" - action: "c++-header-parsing" - action: "assemble" - action: "preprocess-assemble" - flag_group { - flag: "-B" - flag: "external/local_config_cuda/crosstool/windows/msvc_wrapper_for_nvcc.py" - } - } - } - - # Suppress startup banner. 
- feature { - name: "nologo" - flag_set { - action: "c-compile" - action: "c++-compile" - action: "c++-module-compile" - action: "c++-module-codegen" - action: "c++-header-parsing" - action: "assemble" - action: "preprocess-assemble" - action: "c++-link-executable" - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - action: "c++-link-static-library" - flag_group { - flag: "/nologo" - } - } - } - - feature { - name: 'has_configured_linker_path' - } - - # This feature indicates strip is not supported, building stripped binary will just result a copy of orignial binary - feature { - name: 'no_stripping' - } - - # This feature indicates this is a toolchain targeting Windows. - feature { - name: 'targets_windows' - implies: 'copy_dynamic_libraries_to_binary' - enabled: true - } - - feature { - name: 'copy_dynamic_libraries_to_binary' - } - - action_config { - config_name: 'assemble' - action_name: 'assemble' - tool { - tool_path: '%{msvc_ml_path}' - } - implies: 'compiler_input_flags' - implies: 'compiler_output_flags' - implies: 'nologo' - implies: 'msvc_env' - implies: 'sysroot' - } - - action_config { - config_name: 'preprocess-assemble' - action_name: 'preprocess-assemble' - tool { - tool_path: '%{msvc_ml_path}' - } - implies: 'compiler_input_flags' - implies: 'compiler_output_flags' - implies: 'nologo' - implies: 'msvc_env' - implies: 'sysroot' - } - - action_config { - config_name: 'c-compile' - action_name: 'c-compile' - tool { - tool_path: '%{msvc_cl_path}' - } - implies: 'compiler_input_flags' - implies: 'compiler_output_flags' - implies: 'legacy_compile_flags' - implies: 'nologo' - implies: 'msvc_env' - implies: 'parse_showincludes' - implies: 'user_compile_flags' - implies: 'sysroot' - implies: 'unfiltered_compile_flags' - } - - action_config { - config_name: 'c++-compile' - action_name: 'c++-compile' - tool { - tool_path: '%{msvc_cl_path}' - } - implies: 'compiler_input_flags' - implies: 'compiler_output_flags' - implies: 'legacy_compile_flags' - implies: 'nologo' - implies: 'msvc_env' - implies: 'parse_showincludes' - implies: 'user_compile_flags' - implies: 'sysroot' - implies: 'unfiltered_compile_flags' - } - - action_config { - config_name: 'c++-link-executable' - action_name: 'c++-link-executable' - tool { - tool_path: '%{msvc_link_path}' - } - implies: 'nologo' - implies: 'linkstamps' - implies: 'output_execpath_flags' - implies: 'input_param_flags' - implies: 'user_link_flags' - implies: 'legacy_link_flags' - implies: 'linker_subsystem_flag' - implies: 'linker_param_file' - implies: 'msvc_env' - implies: 'no_stripping' - } - - action_config { - config_name: 'c++-link-dynamic-library' - action_name: 'c++-link-dynamic-library' - tool { - tool_path: '%{msvc_link_path}' - } - implies: 'nologo' - implies: 'shared_flag' - implies: 'linkstamps' - implies: 'output_execpath_flags' - implies: 'input_param_flags' - implies: 'user_link_flags' - implies: 'legacy_link_flags' - implies: 'linker_subsystem_flag' - implies: 'linker_param_file' - implies: 'msvc_env' - implies: 'no_stripping' - implies: 'has_configured_linker_path' - implies: 'def_file' - } - - action_config { - config_name: 'c++-link-nodeps-dynamic-library' - action_name: 'c++-link-nodeps-dynamic-library' - tool { - tool_path: '%{msvc_link_path}' - } - implies: 'nologo' - implies: 'shared_flag' - implies: 'linkstamps' - implies: 'output_execpath_flags' - implies: 'input_param_flags' - implies: 'user_link_flags' - implies: 'legacy_link_flags' - implies: 'linker_subsystem_flag' - implies: 
'linker_param_file' - implies: 'msvc_env' - implies: 'no_stripping' - implies: 'has_configured_linker_path' - implies: 'def_file' - } - - action_config { - config_name: 'c++-link-static-library' - action_name: 'c++-link-static-library' - tool { - tool_path: '%{msvc_lib_path}' - } - implies: 'nologo' - implies: 'archiver_flags' - implies: 'input_param_flags' - implies: 'linker_param_file' - implies: 'msvc_env' - } - - # TODO(b/65151735): Remove legacy_compile_flags feature when legacy fields are - # not used in this crosstool - feature { - name: 'legacy_compile_flags' - flag_set { - expand_if_all_available: 'legacy_compile_flags' - action: 'preprocess-assemble' - action: 'c-compile' - action: 'c++-compile' - action: 'c++-header-parsing' - action: 'c++-module-compile' - action: 'c++-module-codegen' - flag_group { - iterate_over: 'legacy_compile_flags' - flag: '%{legacy_compile_flags}' - } - } - } - - feature { - name: "msvc_env" - env_set { - action: "c-compile" - action: "c++-compile" - action: "c++-module-compile" - action: "c++-module-codegen" - action: "c++-header-parsing" - action: "assemble" - action: "preprocess-assemble" - action: "c++-link-executable" - action: "c++-link-dynamic-library" - action: "c++-link-nodeps-dynamic-library" - action: "c++-link-static-library" - env_entry { - key: "PATH" - value: "%{msvc_env_path}" - } - env_entry { - key: "INCLUDE" - value: "%{msvc_env_include}" - } - env_entry { - key: "LIB" - value: "%{msvc_env_lib}" - } - env_entry { - key: "TMP" - value: "%{msvc_env_tmp}" - } - env_entry { - key: "TEMP" - value: "%{msvc_env_tmp}" - } - } - } - - feature { - name: 'include_paths' - flag_set { - action: "assemble" - action: 'preprocess-assemble' - action: 'c-compile' - action: 'c++-compile' - action: 'c++-header-parsing' - action: 'c++-module-compile' - flag_group { - iterate_over: 'quote_include_paths' - flag: '/I%{quote_include_paths}' - } - flag_group { - iterate_over: 'include_paths' - flag: '/I%{include_paths}' - } - flag_group { - iterate_over: 'system_include_paths' - flag: '/I%{system_include_paths}' - } - } - } - - feature { - name: "preprocessor_defines" - flag_set { - action: "assemble" - action: "preprocess-assemble" - action: "c-compile" - action: "c++-compile" - action: "c++-header-parsing" - action: "c++-module-compile" - flag_group { - flag: "/D%{preprocessor_defines}" - iterate_over: "preprocessor_defines" - } - } - } - - # Tell Bazel to parse the output of /showIncludes - feature { - name: 'parse_showincludes' - flag_set { - action: 'preprocess-assemble' - action: 'c-compile' - action: 'c++-compile' - action: 'c++-module-compile' - action: 'c++-header-parsing' - flag_group { - flag: "/showIncludes" - } - } - } - - - feature { - name: 'generate_pdb_file' - requires: { - feature: 'dbg' - } - requires: { - feature: 'fastbuild' - } - } - - feature { - name: 'shared_flag' - flag_set { - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: '/DLL' - } - } - } - - feature { - name: 'linkstamps' - flag_set { - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - expand_if_all_available: 'linkstamp_paths' - flag_group { - iterate_over: 'linkstamp_paths' - flag: '%{linkstamp_paths}' - } - } - } - - feature { - name: 'output_execpath_flags' - flag_set { - expand_if_all_available: 'output_execpath' - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: 
'/OUT:%{output_execpath}' - } - } - } - - feature { - name: 'archiver_flags' - flag_set { - expand_if_all_available: 'output_execpath' - action: 'c++-link-static-library' - flag_group { - flag: '/OUT:%{output_execpath}' - } - } - } - - feature { - name: 'input_param_flags' - flag_set { - expand_if_all_available: 'interface_library_output_path' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "/IMPLIB:%{interface_library_output_path}" - } - } - flag_set { - expand_if_all_available: 'libopts' - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - iterate_over: 'libopts' - flag: '%{libopts}' - } - } - flag_set { - expand_if_all_available: 'libraries_to_link' - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - action: 'c++-link-static-library' - flag_group { - iterate_over: 'libraries_to_link' - flag_group { - expand_if_equal: { - variable: 'libraries_to_link.type' - value: 'object_file_group' - } - iterate_over: 'libraries_to_link.object_files' - flag_group { - flag: '%{libraries_to_link.object_files}' - } - } - flag_group { - expand_if_equal: { - variable: 'libraries_to_link.type' - value: 'object_file' - } - flag_group { - flag: '%{libraries_to_link.name}' - } - } - flag_group { - expand_if_equal: { - variable: 'libraries_to_link.type' - value: 'interface_library' - } - flag_group { - flag: '%{libraries_to_link.name}' - } - } - flag_group { - expand_if_equal: { - variable: 'libraries_to_link.type' - value: 'static_library' - } - flag_group { - expand_if_false: 'libraries_to_link.is_whole_archive' - flag: '%{libraries_to_link.name}' - } - flag_group { - expand_if_true: 'libraries_to_link.is_whole_archive' - flag: '/WHOLEARCHIVE:%{libraries_to_link.name}' - } - } - } - } - } - - # Since this feature is declared earlier in the CROSSTOOL than - # "user_link_flags", this feature will be applied prior to it anwyhere they - # are both implied. And since "user_link_flags" contains the linkopts from - # the build rule, this allows the user to override the /SUBSYSTEM in the BUILD - # file. - feature { - name: 'linker_subsystem_flag' - flag_set { - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: '/SUBSYSTEM:CONSOLE' - } - } - } - - # The "user_link_flags" contains user-defined linkopts (from build rules) - # so it should be defined after features that declare user-overridable flags. - # For example the "linker_subsystem_flag" defines a default "/SUBSYSTEM" flag - # but we want to let the user override it, therefore "link_flag_subsystem" is - # defined earlier in the CROSSTOOL file than "user_link_flags". 
- feature { - name: 'user_link_flags' - flag_set { - expand_if_all_available: 'user_link_flags' - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - iterate_over: 'user_link_flags' - flag: '%{user_link_flags}' - } - } - } - feature { - name: 'legacy_link_flags' - flag_set { - expand_if_all_available: 'legacy_link_flags' - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - iterate_over: 'legacy_link_flags' - flag: '%{legacy_link_flags}' - } - } - } - - feature { - name: 'linker_param_file' - flag_set { - expand_if_all_available: 'linker_param_file' - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - action: 'c++-link-static-library' - flag_group { - flag: '@%{linker_param_file}' - } - } - } - - feature { - name: 'static_link_msvcrt' - } - - feature { - name: 'static_link_msvcrt_no_debug' - flag_set { - action: 'c-compile' - action: 'c++-compile' - flag_group { - flag: "/MT" - } - } - flag_set { - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "/DEFAULTLIB:libcmt.lib" - } - } - requires: { feature: 'fastbuild'} - requires: { feature: 'opt'} - } - - feature { - name: 'dynamic_link_msvcrt_no_debug' - flag_set { - action: 'c-compile' - action: 'c++-compile' - flag_group { - flag: "/MD" - } - } - flag_set { - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "/DEFAULTLIB:msvcrt.lib" - } - } - requires: { feature: 'fastbuild'} - requires: { feature: 'opt'} - } - - feature { - name: 'static_link_msvcrt_debug' - flag_set { - action: 'c-compile' - action: 'c++-compile' - flag_group { - flag: "/MTd" - } - } - flag_set { - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "/DEFAULTLIB:libcmtd.lib" - } - } - requires: { feature: 'dbg'} - } - - feature { - name: 'dynamic_link_msvcrt_debug' - flag_set { - action: 'c-compile' - action: 'c++-compile' - flag_group { - flag: "/MDd" - } - } - flag_set { - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "/DEFAULTLIB:msvcrtd.lib" - } - } - requires: { feature: 'dbg'} - } - - feature { - name: 'dbg' - flag_set { - action: 'c-compile' - action: 'c++-compile' - flag_group { - flag: "/Od" - flag: "/Z7" - flag: "/DDEBUG" - } - } - flag_set { - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "/DEBUG:FULL" - flag: "/INCREMENTAL:NO" - } - } - implies: 'generate_pdb_file' - } - - feature { - name: 'fastbuild' - flag_set { - action: 'c-compile' - action: 'c++-compile' - flag_group { - flag: "/Od" - flag: "/Z7" - flag: "/DDEBUG" - } - } - flag_set { - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "/DEBUG:FASTLINK" - flag: "/INCREMENTAL:NO" - } - } - implies: 'generate_pdb_file' - } - - feature { - name: 'opt' - flag_set { - action: 'c-compile' - action: 'c++-compile' - flag_group { - flag: "/O2" - flag: "/DNDEBUG" - } - } - } - - feature { - name: 'user_compile_flags' - flag_set { - expand_if_all_available: 'user_compile_flags' - action: 
'preprocess-assemble' - action: 'c-compile' - action: 'c++-compile' - action: 'c++-header-parsing' - action: 'c++-module-compile' - action: 'c++-module-codegen' - flag_group { - iterate_over: 'user_compile_flags' - flag: '%{user_compile_flags}' - } - } - } - - feature { - name: 'sysroot' - flag_set { - expand_if_all_available: 'sysroot' - action: 'assemble' - action: 'preprocess-assemble' - action: 'c-compile' - action: 'c++-compile' - action: 'c++-header-parsing' - action: 'c++-module-compile' - action: 'c++-module-codegen' - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - iterate_over: 'sysroot' - flag: '--sysroot=%{sysroot}' - } - } - } - - feature { - name: 'unfiltered_compile_flags' - flag_set { - expand_if_all_available: 'unfiltered_compile_flags' - action: 'preprocess-assemble' - action: 'c-compile' - action: 'c++-compile' - action: 'c++-header-parsing' - action: 'c++-module-compile' - action: 'c++-module-codegen' - flag_group { - iterate_over: 'unfiltered_compile_flags' - flag: '%{unfiltered_compile_flags}' - } - } - } - - feature { - name: 'compiler_output_flags' - flag_set { - action: 'assemble' - flag_group { - expand_if_all_available: 'output_file' - expand_if_none_available: 'output_assembly_file' - expand_if_none_available: 'output_preprocess_file' - flag: '/Fo%{output_file}' - flag: '/Zi' - } - } - flag_set { - action: 'preprocess-assemble' - action: 'c-compile' - action: 'c++-compile' - action: 'c++-header-parsing' - action: 'c++-module-compile' - action: 'c++-module-codegen' - flag_group { - expand_if_all_available: 'output_file' - expand_if_none_available: 'output_assembly_file' - expand_if_none_available: 'output_preprocess_file' - flag: '/Fo%{output_file}' - } - flag_group { - expand_if_all_available: 'output_file' - expand_if_all_available: 'output_assembly_file' - flag: '/Fa%{output_file}' - } - flag_group { - expand_if_all_available: 'output_file' - expand_if_all_available: 'output_preprocess_file' - flag: '/P' - flag: '/Fi%{output_file}' - } - } - } - - feature { - name: 'compiler_input_flags' - flag_set { - action: 'assemble' - action: 'preprocess-assemble' - action: 'c-compile' - action: 'c++-compile' - action: 'c++-header-parsing' - action: 'c++-module-compile' - action: 'c++-module-codegen' - flag_group { - expand_if_all_available: 'source_file' - flag: '/c' - flag: '%{source_file}' - } - } - } - - feature { - name : 'def_file', - flag_set { - expand_if_all_available: 'def_file_path' - action: 'c++-link-executable' - action: 'c++-link-dynamic-library' - action: "c++-link-nodeps-dynamic-library" - flag_group { - flag: "/DEF:%{def_file_path}" - # We can specify a different DLL name in DEF file, /ignore:4070 suppresses - # the warning message about DLL name doesn't match the default one. 
- # See https://msdn.microsoft.com/en-us/library/sfkk2fz7.aspx - flag: "/ignore:4070" - } - } - } - - feature { - name: 'windows_export_all_symbols' - } - - feature { - name: 'no_windows_export_all_symbols' - } - - linking_mode_flags { mode: DYNAMIC } -} diff --git a/build_deps/toolchains/gpu/crosstool/cc_toolchain_config.bzl.tpl b/build_deps/toolchains/gpu/crosstool/cc_toolchain_config.bzl.tpl deleted file mode 100755 index ba002b45..00000000 --- a/build_deps/toolchains/gpu/crosstool/cc_toolchain_config.bzl.tpl +++ /dev/null @@ -1,1493 +0,0 @@ -"""cc_toolchain_config rule for configuring CUDA toolchains on Linux, Mac, and Windows.""" - -load( - "@bazel_tools//tools/cpp:cc_toolchain_config_lib.bzl", - "action_config", - "env_entry", - "env_set", - "feature", - "feature_set", - "flag_group", - "flag_set", - "tool", - "tool_path", - "variable_with_value", -) -load( - "@bazel_tools//tools/build_defs/cc:action_names.bzl", - "ASSEMBLE_ACTION_NAME", - "CC_FLAGS_MAKE_VARIABLE_ACTION_NAME", - "CLIF_MATCH_ACTION_NAME", - "CPP_COMPILE_ACTION_NAME", - "CPP_HEADER_PARSING_ACTION_NAME", - "CPP_LINK_DYNAMIC_LIBRARY_ACTION_NAME", - "CPP_LINK_EXECUTABLE_ACTION_NAME", - "CPP_LINK_NODEPS_DYNAMIC_LIBRARY_ACTION_NAME", - "CPP_LINK_STATIC_LIBRARY_ACTION_NAME", - "CPP_MODULE_CODEGEN_ACTION_NAME", - "CPP_MODULE_COMPILE_ACTION_NAME", - "C_COMPILE_ACTION_NAME", - "LINKSTAMP_COMPILE_ACTION_NAME", - "LTO_BACKEND_ACTION_NAME", - "LTO_INDEXING_ACTION_NAME", - "OBJCPP_COMPILE_ACTION_NAME", - "OBJCPP_EXECUTABLE_ACTION_NAME", - "OBJC_ARCHIVE_ACTION_NAME", - "OBJC_COMPILE_ACTION_NAME", - "OBJC_EXECUTABLE_ACTION_NAME", - "OBJC_FULLY_LINK_ACTION_NAME", - "PREPROCESS_ASSEMBLE_ACTION_NAME", - "STRIP_ACTION_NAME", -) - -ACTION_NAMES = struct( - assemble = ASSEMBLE_ACTION_NAME, - c_compile = C_COMPILE_ACTION_NAME, - cc_flags_make_variable = CC_FLAGS_MAKE_VARIABLE_ACTION_NAME, - clif_match = CLIF_MATCH_ACTION_NAME, - cpp_compile = CPP_COMPILE_ACTION_NAME, - cpp_header_parsing = CPP_HEADER_PARSING_ACTION_NAME, - cpp_link_dynamic_library = CPP_LINK_DYNAMIC_LIBRARY_ACTION_NAME, - cpp_link_executable = CPP_LINK_EXECUTABLE_ACTION_NAME, - cpp_link_nodeps_dynamic_library = CPP_LINK_NODEPS_DYNAMIC_LIBRARY_ACTION_NAME, - cpp_link_static_library = CPP_LINK_STATIC_LIBRARY_ACTION_NAME, - cpp_module_codegen = CPP_MODULE_CODEGEN_ACTION_NAME, - cpp_module_compile = CPP_MODULE_COMPILE_ACTION_NAME, - ld_embed_data = "ld_embed_data", - linkstamp_compile = LINKSTAMP_COMPILE_ACTION_NAME, - lto_backend = LTO_BACKEND_ACTION_NAME, - lto_indexing = LTO_INDEXING_ACTION_NAME, - objc_archive = OBJC_ARCHIVE_ACTION_NAME, - objc_compile = OBJC_COMPILE_ACTION_NAME, - objc_executable = OBJC_EXECUTABLE_ACTION_NAME, - objc_fully_link = OBJC_FULLY_LINK_ACTION_NAME, - objcopy_embed_data = "objcopy_embed_data", - objcpp_compile = OBJCPP_COMPILE_ACTION_NAME, - objcpp_executable = OBJCPP_EXECUTABLE_ACTION_NAME, - preprocess_assemble = PREPROCESS_ASSEMBLE_ACTION_NAME, - strip = STRIP_ACTION_NAME, -) - -def _impl(ctx): - if (ctx.attr.cpu == "darwin"): - toolchain_identifier = "local_darwin" - elif (ctx.attr.cpu == "local"): - toolchain_identifier = "local_linux" - elif (ctx.attr.cpu == "x64_windows"): - toolchain_identifier = "local_windows" - else: - fail("Unreachable") - - host_system_name = "local" - - target_system_name = "local" - - if (ctx.attr.cpu == "darwin"): - target_cpu = "darwin" - elif (ctx.attr.cpu == "local"): - target_cpu = "local" - elif (ctx.attr.cpu == "x64_windows"): - target_cpu = "x64_windows" - else: - fail("Unreachable") - - if (ctx.attr.cpu == 
"local"): - target_libc = "local" - elif (ctx.attr.cpu == "darwin"): - target_libc = "macosx" - elif (ctx.attr.cpu == "x64_windows"): - target_libc = "msvcrt" - else: - fail("Unreachable") - - if (ctx.attr.cpu == "darwin" or - ctx.attr.cpu == "local"): - compiler = "compiler" - elif (ctx.attr.cpu == "x64_windows"): - compiler = "msvc-cl" - else: - fail("Unreachable") - - abi_version = "local" - - abi_libc_version = "local" - - cc_target_os = None - - builtin_sysroot = None - - all_link_actions = [ - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ] - - cpp_link_dynamic_library_action = action_config( - action_name = ACTION_NAMES.cpp_link_dynamic_library, - implies = [ - "nologo", - "shared_flag", - "linkstamps", - "output_execpath_flags", - "input_param_flags", - "user_link_flags", - "linker_subsystem_flag", - "linker_param_file", - "msvc_env", - "no_stripping", - "has_configured_linker_path", - "def_file", - ], - tools = [tool(path = ctx.attr.msvc_link_path)], - ) - - cpp_link_nodeps_dynamic_library_action = action_config( - action_name = ACTION_NAMES.cpp_link_nodeps_dynamic_library, - implies = [ - "nologo", - "shared_flag", - "linkstamps", - "output_execpath_flags", - "input_param_flags", - "user_link_flags", - "linker_subsystem_flag", - "linker_param_file", - "msvc_env", - "no_stripping", - "has_configured_linker_path", - "def_file", - ], - tools = [tool(path = ctx.attr.msvc_link_path)], - ) - - cpp_link_static_library_action = action_config( - action_name = ACTION_NAMES.cpp_link_static_library, - implies = [ - "nologo", - "archiver_flags", - "input_param_flags", - "linker_param_file", - "msvc_env", - ], - tools = [tool(path = ctx.attr.msvc_lib_path)], - ) - - assemble_action = action_config( - action_name = ACTION_NAMES.assemble, - implies = [ - "compiler_input_flags", - "compiler_output_flags", - "nologo", - "msvc_env", - "sysroot", - ], - tools = [tool(path = ctx.attr.msvc_ml_path)], - ) - - preprocess_assemble_action = action_config( - action_name = ACTION_NAMES.preprocess_assemble, - implies = [ - "compiler_input_flags", - "compiler_output_flags", - "nologo", - "msvc_env", - "sysroot", - ], - tools = [tool(path = ctx.attr.msvc_ml_path)], - ) - - c_compile_action = action_config( - action_name = ACTION_NAMES.c_compile, - implies = [ - "compiler_input_flags", - "compiler_output_flags", - "nologo", - "msvc_env", - "parse_showincludes", - "user_compile_flags", - "sysroot", - "unfiltered_compile_flags", - ], - tools = [tool(path = ctx.attr.msvc_cl_path)], - ) - - cpp_compile_action = action_config( - action_name = ACTION_NAMES.cpp_compile, - implies = [ - "compiler_input_flags", - "compiler_output_flags", - "nologo", - "msvc_env", - "parse_showincludes", - "user_compile_flags", - "sysroot", - "unfiltered_compile_flags", - ], - tools = [tool(path = ctx.attr.msvc_cl_path)], - ) - - cpp_link_executable_action = action_config( - action_name = ACTION_NAMES.cpp_link_executable, - implies = [ - "nologo", - "linkstamps", - "output_execpath_flags", - "input_param_flags", - "user_link_flags", - "linker_subsystem_flag", - "linker_param_file", - "msvc_env", - "no_stripping", - ], - tools = [tool(path = ctx.attr.msvc_link_path)], - ) - - if (ctx.attr.cpu == "darwin" or - ctx.attr.cpu == "local"): - action_configs = [] - elif (ctx.attr.cpu == "x64_windows"): - action_configs = [ - assemble_action, - preprocess_assemble_action, - c_compile_action, - cpp_compile_action, - cpp_link_executable_action, - 
cpp_link_dynamic_library_action, - cpp_link_nodeps_dynamic_library_action, - cpp_link_static_library_action, - ] - else: - fail("Unreachable") - - no_windows_export_all_symbols_feature = feature(name = "no_windows_export_all_symbols") - - pic_feature = feature( - name = "pic", - enabled = True, - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group(flags = ["-fPIC"], expand_if_available = "pic"), - flag_group( - flags = ["-fPIE"], - expand_if_not_available = "pic", - ), - ], - ), - ], - ) - - preprocessor_defines_feature = feature( - name = "preprocessor_defines", - enabled = True, - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ], - flag_groups = [ - flag_group( - flags = ["/D%{preprocessor_defines}"], - iterate_over = "preprocessor_defines", - ), - ], - ), - ], - ) - - generate_pdb_file_feature = feature( - name = "generate_pdb_file", - requires = [ - feature_set(features = ["dbg"]), - feature_set(features = ["fastbuild"]), - ], - ) - - linkstamps_feature = feature( - name = "linkstamps", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["%{linkstamp_paths}"], - iterate_over = "linkstamp_paths", - expand_if_available = "linkstamp_paths", - ), - ], - ), - ], - ) - - unfiltered_compile_flags_feature = feature( - name = "unfiltered_compile_flags", - flag_sets = ([ - flag_set( - actions = [ - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ], - flag_groups = [ - flag_group( - flags = ctx.attr.host_unfiltered_compile_flags, - ), - ], - ), - ] if ctx.attr.host_unfiltered_compile_flags else []), - ) - - determinism_feature = feature( - name = "determinism", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = [ - "-Wno-builtin-macro-redefined", - "-D__DATE__=\"redacted\"", - "-D__TIMESTAMP__=\"redacted\"", - "-D__TIME__=\"redacted\"", - ], - ), - ], - ), - ], - ) - - nologo_feature = feature( - name = "nologo", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_static_library, - ], - flag_groups = [flag_group(flags = ["/nologo"])], - ), - ], - ) - - supports_pic_feature = feature(name = "supports_pic", enabled = True) - - output_execpath_flags_feature = feature( - name = "output_execpath_flags", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["/OUT:%{output_execpath}"], - expand_if_available = "output_execpath", - ), - ], - ), - ], - ) - - default_link_flags_feature = feature( - name = "default_link_flags", - enabled = True, - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/MACHINE:X64"])], - ), - ], - ) - - if (ctx.attr.cpu == "local"): - hardening_feature = feature( - name = "hardening", - flag_sets = [ - flag_set( - 
actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = [ - "-U_FORTIFY_SOURCE", - "-D_FORTIFY_SOURCE=1", - "-fstack-protector", - ], - ), - ], - ), - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [flag_group(flags = ["-Wl,-z,relro,-z,now"])], - ), - flag_set( - actions = [ACTION_NAMES.cpp_link_executable], - flag_groups = [flag_group(flags = ["-pie", "-Wl,-z,relro,-z,now"])], - ), - ], - ) - elif (ctx.attr.cpu == "darwin"): - hardening_feature = feature( - name = "hardening", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = [ - "-U_FORTIFY_SOURCE", - "-D_FORTIFY_SOURCE=1", - "-fstack-protector", - ], - ), - ], - ), - flag_set( - actions = [ACTION_NAMES.cpp_link_executable], - flag_groups = [flag_group(flags = ["-pie"])], - ), - ], - ) - else: - hardening_feature = None - - supports_dynamic_linker_feature = feature(name = "supports_dynamic_linker", enabled = True) - - targets_windows_feature = feature( - name = "targets_windows", - enabled = True, - implies = ["copy_dynamic_libraries_to_binary"], - ) - - msvc_env_feature = feature( - name = "msvc_env", - env_sets = [ - env_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_static_library, - ], - env_entries = [ - env_entry(key = "PATH", value = ctx.attr.msvc_env_path), - env_entry( - key = "INCLUDE", - value = ctx.attr.msvc_env_include, - ), - env_entry(key = "LIB", value = ctx.attr.msvc_env_lib), - env_entry(key = "TMP", value = ctx.attr.msvc_env_tmp), - env_entry(key = "TEMP", value = ctx.attr.msvc_env_tmp), - ], - ), - ], - ) - - linker_subsystem_flag_feature = feature( - name = "linker_subsystem_flag", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/SUBSYSTEM:CONSOLE"])], - ), - ], - ) - - dynamic_link_msvcrt_no_debug_feature = feature( - name = "dynamic_link_msvcrt_no_debug", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/MD"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEFAULTLIB:msvcrt.lib"])], - ), - ], - requires = [ - feature_set(features = ["fastbuild"]), - feature_set(features = ["opt"]), - ], - ) - - warnings_feature = feature( - name = "warnings", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = ["-Wall"] + ctx.attr.host_compiler_warnings, - ), - ], - ), - ], - ) - - dynamic_link_msvcrt_debug_feature = feature( - name = "dynamic_link_msvcrt_debug", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/MDd"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEFAULTLIB:msvcrtd.lib"])], - ), - ], - requires = [feature_set(features = ["dbg"])], - ) - - compiler_output_flags_feature = feature( - name = "compiler_output_flags", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.assemble], - flag_groups = [ - 
flag_group( - flag_groups = [ - flag_group( - flags = ["/Fo%{output_file}", "/Zi"], - expand_if_not_available = "output_preprocess_file", - ), - ], - expand_if_available = "output_file", - expand_if_not_available = "output_assembly_file", - ), - ], - ), - flag_set( - actions = [ - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ], - flag_groups = [ - flag_group( - flag_groups = [ - flag_group( - flags = ["/Fo%{output_file}"], - expand_if_not_available = "output_preprocess_file", - ), - ], - expand_if_available = "output_file", - expand_if_not_available = "output_assembly_file", - ), - flag_group( - flag_groups = [ - flag_group( - flags = ["/Fa%{output_file}"], - expand_if_available = "output_assembly_file", - ), - ], - expand_if_available = "output_file", - ), - flag_group( - flag_groups = [ - flag_group( - flags = ["/P", "/Fi%{output_file}"], - expand_if_available = "output_preprocess_file", - ), - ], - expand_if_available = "output_file", - ), - ], - ), - ], - ) - - default_compile_flags_feature = feature( - name = "default_compile_flags", - enabled = True, - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.linkstamp_compile, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.lto_backend, - ACTION_NAMES.clif_match, - ], - flag_groups = [ - flag_group( - flags = [ - "/DCOMPILER_MSVC", - "/DNOMINMAX", - "/D_WIN32_WINNT=0x0600", - "/D_CRT_SECURE_NO_DEPRECATE", - "/D_CRT_SECURE_NO_WARNINGS", - "/D_SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS", - "/bigobj", - "/Zm500", - "/J", - "/Gy", - "/GF", - "/EHsc", - "/wd4351", - "/wd4291", - "/wd4250", - "/wd4996", - ], - ), - ], - ), - ], - ) - - static_link_msvcrt_debug_feature = feature( - name = "static_link_msvcrt_debug", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/MTd"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEFAULTLIB:libcmtd.lib"])], - ), - ], - requires = [feature_set(features = ["dbg"])], - ) - - static_link_msvcrt_feature = feature(name = "static_link_msvcrt") - - if (ctx.attr.cpu == "darwin" or - ctx.attr.cpu == "local"): - dbg_feature = feature( - name = "dbg", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["-g"])], - ), - ], - implies = ["common"], - ) - elif (ctx.attr.cpu == "x64_windows"): - dbg_feature = feature( - name = "dbg", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/Od", "/Z7", "/DDEBUG"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEBUG:FULL", "/INCREMENTAL:NO"])], - ), - ], - implies = ["generate_pdb_file"], - ) - else: - dbg_feature = None - - undefined_dynamic_feature = feature( - name = "undefined-dynamic", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_executable, - ], - flag_groups = [flag_group(flags = ["-undefined", "dynamic_lookup"])], - ), - ], - ) - - parse_showincludes_feature = feature( - name = "parse_showincludes", - flag_sets = [ - 
flag_set( - actions = [ - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_header_parsing, - ], - flag_groups = [flag_group(flags = ["/showIncludes"])], - ), - ], - ) - - linker_param_file_feature = feature( - name = "linker_param_file", - flag_sets = [ - flag_set( - actions = all_link_actions + - [ACTION_NAMES.cpp_link_static_library], - flag_groups = [ - flag_group( - flags = ["@%{linker_param_file}"], - expand_if_available = "linker_param_file", - ), - ], - ), - ], - ) - - static_link_msvcrt_no_debug_feature = feature( - name = "static_link_msvcrt_no_debug", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/MT"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["/DEFAULTLIB:libcmt.lib"])], - ), - ], - requires = [ - feature_set(features = ["fastbuild"]), - feature_set(features = ["opt"]), - ], - ) - - supports_interface_shared_libraries_feature = feature( - name = "supports_interface_shared_libraries", - enabled = True, - ) - - disable_assertions_feature = feature( - name = "disable-assertions", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["-DNDEBUG"])], - ), - ], - ) - - if (ctx.attr.cpu == "x64_windows"): - fastbuild_feature = feature( - name = "fastbuild", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/Od", "/Z7", "/DDEBUG"])], - ), - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group(flags = ["/DEBUG:FASTLINK", "/INCREMENTAL:NO"]), - ], - ), - ], - implies = ["generate_pdb_file"], - ) - elif (ctx.attr.cpu == "darwin" or - ctx.attr.cpu == "local"): - fastbuild_feature = feature(name = "fastbuild", implies = ["common"]) - else: - fastbuild_feature = None - - user_compile_flags_feature = feature( - name = "user_compile_flags", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ], - flag_groups = [ - flag_group( - flags = ["%{user_compile_flags}"], - iterate_over = "user_compile_flags", - expand_if_available = "user_compile_flags", - ), - ], - ), - ], - ) - - compiler_input_flags_feature = feature( - name = "compiler_input_flags", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ], - flag_groups = [ - flag_group( - flags = ["/c", "%{source_file}"], - expand_if_available = "source_file", - ), - ], - ), - ], - ) - - no_legacy_features_feature = feature(name = "no_legacy_features") - - archiver_flags_feature = feature( - name = "archiver_flags", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.cpp_link_static_library], - flag_groups = [ - flag_group( - flags = ["/OUT:%{output_execpath}"], - expand_if_available = "output_execpath", - ), - ], - ), - ], - ) - - redirector_feature = feature( - name = "redirector", - enabled = True, - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - 
ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ], - flag_groups = [ - flag_group( - flags = [ - "-B", - "external/local_config_cuda/crosstool/windows/msvc_wrapper_for_nvcc.py", - ], - ), - ], - ), - ], - ) - - linker_bin_path_feature = feature( - name = "linker-bin-path", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["-B" + ctx.attr.linker_bin_path])], - ), - ], - ) - - if (ctx.attr.cpu == "local"): - opt_feature = feature( - name = "opt", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = ["-g0", "-O2", "-ffunction-sections", "-fdata-sections"], - ), - ], - ), - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_executable, - ], - flag_groups = [flag_group(flags = ["-Wl,--gc-sections"])], - ), - ], - implies = ["common", "disable-assertions"], - ) - elif (ctx.attr.cpu == "darwin"): - opt_feature = feature( - name = "opt", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [ - flag_group( - flags = ["-g0", "-O2", "-ffunction-sections", "-fdata-sections"], - ), - ], - ), - ], - implies = ["common", "disable-assertions"], - ) - elif (ctx.attr.cpu == "x64_windows"): - opt_feature = feature( - name = "opt", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["/O2", "/DNDEBUG"])], - ), - ], - ) - else: - opt_feature = None - - include_paths_feature = feature( - name = "include_paths", - enabled = True, - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ], - flag_groups = [ - flag_group( - flags = ["/I%{quote_include_paths}"], - iterate_over = "quote_include_paths", - ), - flag_group( - flags = ["/I%{include_paths}"], - iterate_over = "include_paths", - ), - flag_group( - flags = ["/I%{system_include_paths}"], - iterate_over = "system_include_paths", - ), - ], - ), - ], - ) - - shared_flag_feature = feature( - name = "shared_flag", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [flag_group(flags = ["/DLL"])], - ), - ], - ) - - windows_export_all_symbols_feature = feature(name = "windows_export_all_symbols") - - frame_pointer_feature = feature( - name = "frame-pointer", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.c_compile, ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["-fno-omit-frame-pointer"])], - ), - ], - ) - - build_id_feature = feature( - name = "build-id", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["-Wl,--build-id=md5", "-Wl,--hash-style=gnu"], - ), - ], - ), - ], - ) - - sysroot_feature = feature( - name = "sysroot", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.assemble, - ACTION_NAMES.preprocess_assemble, - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_header_parsing, - ACTION_NAMES.cpp_module_compile, - ACTION_NAMES.cpp_module_codegen, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [ - flag_group( 
- flags = ["--sysroot=%{sysroot}"], - iterate_over = "sysroot", - expand_if_available = "sysroot", - ), - ], - ), - ], - ) - - def_file_feature = feature( - name = "def_file", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["/DEF:%{def_file_path}", "/ignore:4070"], - expand_if_available = "def_file_path", - ), - ], - ), - ], - ) - - if (ctx.attr.cpu == "darwin"): - stdlib_feature = feature( - name = "stdlib", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["-lc++"])], - ), - ], - ) - elif (ctx.attr.cpu == "local"): - stdlib_feature = feature( - name = "stdlib", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [flag_group(flags = ["-lstdc++"])], - ), - ], - ) - else: - stdlib_feature = None - - no_stripping_feature = feature(name = "no_stripping") - - alwayslink_feature = feature( - name = "alwayslink", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ACTION_NAMES.cpp_link_executable, - ], - flag_groups = [flag_group(flags = ["-Wl,-no-as-needed"])], - ), - ], - ) - - input_param_flags_feature = feature( - name = "input_param_flags", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [ - flag_group( - flags = ["/IMPLIB:%{interface_library_output_path}"], - expand_if_available = "interface_library_output_path", - ), - ], - ), - flag_set( - actions = all_link_actions + - [ACTION_NAMES.cpp_link_static_library], - flag_groups = [ - flag_group( - iterate_over = "libraries_to_link", - flag_groups = [ - flag_group( - iterate_over = "libraries_to_link.object_files", - flag_groups = [flag_group(flags = ["%{libraries_to_link.object_files}"])], - expand_if_equal = variable_with_value( - name = "libraries_to_link.type", - value = "object_file_group", - ), - ), - flag_group( - flag_groups = [flag_group(flags = ["%{libraries_to_link.name}"])], - expand_if_equal = variable_with_value( - name = "libraries_to_link.type", - value = "object_file", - ), - ), - flag_group( - flag_groups = [flag_group(flags = ["%{libraries_to_link.name}"])], - expand_if_equal = variable_with_value( - name = "libraries_to_link.type", - value = "interface_library", - ), - ), - flag_group( - flag_groups = [ - flag_group( - flags = ["%{libraries_to_link.name}"], - expand_if_false = "libraries_to_link.is_whole_archive", - ), - flag_group( - flags = ["/WHOLEARCHIVE:%{libraries_to_link.name}"], - expand_if_true = "libraries_to_link.is_whole_archive", - ), - ], - expand_if_equal = variable_with_value( - name = "libraries_to_link.type", - value = "static_library", - ), - ), - ], - expand_if_available = "libraries_to_link", - ), - ], - ), - ], - ) - - if (ctx.attr.cpu == "local"): - no_canonical_prefixes_feature = feature( - name = "no-canonical-prefixes", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [ - flag_group( - flags = [ - "-no-canonical-prefixes", - ] + ctx.attr.extra_no_canonical_prefixes_flags, - ), - ], - ), - ], - ) - elif (ctx.attr.cpu == "darwin"): - no_canonical_prefixes_feature = feature( - name = "no-canonical-prefixes", - flag_sets = [ - flag_set( - actions = [ - ACTION_NAMES.c_compile, - ACTION_NAMES.cpp_compile, - 
ACTION_NAMES.cpp_link_executable, - ACTION_NAMES.cpp_link_dynamic_library, - ACTION_NAMES.cpp_link_nodeps_dynamic_library, - ], - flag_groups = [flag_group(flags = ["-no-canonical-prefixes"])], - ), - ], - ) - else: - no_canonical_prefixes_feature = None - - has_configured_linker_path_feature = feature(name = "has_configured_linker_path") - - copy_dynamic_libraries_to_binary_feature = feature(name = "copy_dynamic_libraries_to_binary") - - user_link_flags_feature = feature( - name = "user_link_flags", - flag_sets = [ - flag_set( - actions = all_link_actions, - flag_groups = [ - flag_group( - flags = ["%{user_link_flags}"], - iterate_over = "user_link_flags", - expand_if_available = "user_link_flags", - ), - ], - ), - ], - ) - - cpp11_feature = feature( - name = "c++11", - flag_sets = [ - flag_set( - actions = [ACTION_NAMES.cpp_compile], - flag_groups = [flag_group(flags = ["-std=c++11"])], - ), - ], - ) - - if (ctx.attr.cpu == "local"): - common_feature = feature( - name = "common", - implies = [ - "stdlib", - "c++11", - "determinism", - "alwayslink", - "hardening", - "warnings", - "frame-pointer", - "build-id", - "no-canonical-prefixes", - "linker-bin-path", - ], - ) - elif (ctx.attr.cpu == "darwin"): - common_feature = feature( - name = "common", - implies = [ - "stdlib", - "c++11", - "determinism", - "hardening", - "warnings", - "frame-pointer", - "no-canonical-prefixes", - "linker-bin-path", - "undefined-dynamic", - ], - ) - else: - common_feature = None - - if (ctx.attr.cpu == "local"): - features = [ - cpp11_feature, - stdlib_feature, - determinism_feature, - alwayslink_feature, - pic_feature, - hardening_feature, - warnings_feature, - frame_pointer_feature, - build_id_feature, - no_canonical_prefixes_feature, - disable_assertions_feature, - linker_bin_path_feature, - common_feature, - opt_feature, - fastbuild_feature, - dbg_feature, - supports_dynamic_linker_feature, - supports_pic_feature, - ] - elif (ctx.attr.cpu == "darwin"): - features = [ - cpp11_feature, - stdlib_feature, - determinism_feature, - pic_feature, - hardening_feature, - warnings_feature, - frame_pointer_feature, - no_canonical_prefixes_feature, - disable_assertions_feature, - linker_bin_path_feature, - undefined_dynamic_feature, - common_feature, - opt_feature, - fastbuild_feature, - dbg_feature, - supports_dynamic_linker_feature, - supports_pic_feature, - ] - elif (ctx.attr.cpu == "x64_windows"): - features = [ - no_legacy_features_feature, - redirector_feature, - nologo_feature, - has_configured_linker_path_feature, - no_stripping_feature, - targets_windows_feature, - copy_dynamic_libraries_to_binary_feature, - default_compile_flags_feature, - msvc_env_feature, - include_paths_feature, - preprocessor_defines_feature, - parse_showincludes_feature, - generate_pdb_file_feature, - shared_flag_feature, - linkstamps_feature, - output_execpath_flags_feature, - archiver_flags_feature, - input_param_flags_feature, - linker_subsystem_flag_feature, - user_link_flags_feature, - default_link_flags_feature, - linker_param_file_feature, - static_link_msvcrt_feature, - static_link_msvcrt_no_debug_feature, - dynamic_link_msvcrt_no_debug_feature, - static_link_msvcrt_debug_feature, - dynamic_link_msvcrt_debug_feature, - dbg_feature, - fastbuild_feature, - opt_feature, - user_compile_flags_feature, - sysroot_feature, - unfiltered_compile_flags_feature, - compiler_output_flags_feature, - compiler_input_flags_feature, - def_file_feature, - windows_export_all_symbols_feature, - no_windows_export_all_symbols_feature, - 
supports_dynamic_linker_feature, - supports_interface_shared_libraries_feature, - ] - else: - fail("Unreachable") - - cxx_builtin_include_directories = ctx.attr.builtin_include_directories - - if (ctx.attr.cpu == "x64_windows"): - tool_paths = [ - tool_path(name = "ar", path = ctx.attr.msvc_lib_path), - tool_path(name = "ml", path = ctx.attr.msvc_ml_path), - tool_path(name = "cpp", path = ctx.attr.msvc_cl_path), - tool_path(name = "gcc", path = ctx.attr.msvc_cl_path), - tool_path(name = "gcov", path = "wrapper/bin/msvc_nop.bat"), - tool_path(name = "ld", path = ctx.attr.msvc_link_path), - tool_path(name = "nm", path = "wrapper/bin/msvc_nop.bat"), - tool_path( - name = "objcopy", - path = "wrapper/bin/msvc_nop.bat", - ), - tool_path( - name = "objdump", - path = "wrapper/bin/msvc_nop.bat", - ), - tool_path( - name = "strip", - path = "wrapper/bin/msvc_nop.bat", - ), - ] - elif (ctx.attr.cpu == "local"): - tool_paths = [ - tool_path(name = "gcc", path = ctx.attr.host_compiler_path), - tool_path(name = "ar", path = ctx.attr.host_compiler_prefix + "/ar"), - tool_path(name = "compat-ld", path = ctx.attr.host_compiler_prefix + "/ld"), - tool_path(name = "cpp", path = ctx.attr.host_compiler_prefix + "/cpp"), - tool_path(name = "dwp", path = ctx.attr.host_compiler_prefix + "/dwp"), - tool_path(name = "gcov", path = ctx.attr.host_compiler_prefix + "/gcov"), - tool_path(name = "ld", path = ctx.attr.host_compiler_prefix + "/ld"), - tool_path(name = "nm", path = ctx.attr.host_compiler_prefix + "/nm"), - tool_path(name = "objcopy", path = ctx.attr.host_compiler_prefix + "/objcopy"), - tool_path(name = "objdump", path = ctx.attr.host_compiler_prefix + "/objdump"), - tool_path(name = "strip", path = ctx.attr.host_compiler_prefix + "/strip"), - ] - elif (ctx.attr.cpu == "darwin"): - tool_paths = [ - tool_path(name = "gcc", path = ctx.attr.host_compiler_path), - tool_path(name = "ar", path = ctx.attr.host_compiler_prefix + "/libtool"), - tool_path(name = "compat-ld", path = ctx.attr.host_compiler_prefix + "/ld"), - tool_path(name = "cpp", path = ctx.attr.host_compiler_prefix + "/cpp"), - tool_path(name = "dwp", path = ctx.attr.host_compiler_prefix + "/dwp"), - tool_path(name = "gcov", path = ctx.attr.host_compiler_prefix + "/gcov"), - tool_path(name = "ld", path = ctx.attr.host_compiler_prefix + "/ld"), - tool_path(name = "nm", path = ctx.attr.host_compiler_prefix + "/nm"), - tool_path(name = "objcopy", path = ctx.attr.host_compiler_prefix + "/objcopy"), - tool_path(name = "objdump", path = ctx.attr.host_compiler_prefix + "/objdump"), - tool_path(name = "strip", path = ctx.attr.host_compiler_prefix + "/strip"), - ] - else: - fail("Unreachable") - - out = ctx.actions.declare_file(ctx.label.name) - ctx.actions.write(out, "Fake executable") - return [ - cc_common.create_cc_toolchain_config_info( - ctx = ctx, - features = features, - action_configs = action_configs, - artifact_name_patterns = [], - cxx_builtin_include_directories = cxx_builtin_include_directories, - toolchain_identifier = toolchain_identifier, - host_system_name = host_system_name, - target_system_name = target_system_name, - target_cpu = target_cpu, - target_libc = target_libc, - compiler = compiler, - abi_version = abi_version, - abi_libc_version = abi_libc_version, - tool_paths = tool_paths, - make_variables = [], - builtin_sysroot = builtin_sysroot, - cc_target_os = cc_target_os, - ), - DefaultInfo( - executable = out, - ), - ] - -cc_toolchain_config = rule( - attrs = { - "cpu": attr.string( - mandatory = True, - values = [ - "darwin", - 
"local", - "x64_windows", - ], - ), - "builtin_include_directories": attr.string_list(), - "extra_no_canonical_prefixes_flags": attr.string_list(), - "host_compiler_path": attr.string(), - "host_compiler_prefix": attr.string(), - "host_compiler_warnings": attr.string_list(), - "host_unfiltered_compile_flags": attr.string_list(), - "linker_bin_path": attr.string(), - "msvc_cl_path": attr.string(default = "msvc_not_used"), - "msvc_env_include": attr.string(default = "msvc_not_used"), - "msvc_env_lib": attr.string(default = "msvc_not_used"), - "msvc_env_path": attr.string(default = "msvc_not_used"), - "msvc_env_tmp": attr.string(default = "msvc_not_used"), - "msvc_lib_path": attr.string(default = "msvc_not_used"), - "msvc_link_path": attr.string(default = "msvc_not_used"), - "msvc_ml_path": attr.string(default = "msvc_not_used"), - }, - executable = True, - provides = [CcToolchainConfigInfo], - implementation = _impl, -) diff --git a/build_deps/toolchains/gpu/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl b/build_deps/toolchains/gpu/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl deleted file mode 100644 index 81c16c61..00000000 --- a/build_deps/toolchains/gpu/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl +++ /dev/null @@ -1,269 +0,0 @@ -#!/usr/bin/env python -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Crosstool wrapper for compiling CUDA programs. - -SYNOPSIS: - crosstool_wrapper_is_not_gcc [options passed in by cc_library() - or cc_binary() rule] - -DESCRIPTION: - This script is expected to be called by the cc_library() or cc_binary() bazel - rules. When the option "-x cuda" is present in the list of arguments passed - to this script, it invokes the nvcc CUDA compiler. Most arguments are passed - as is as a string to --compiler-options of nvcc. When "-x cuda" is not - present, this wrapper invokes hybrid_driver_is_not_gcc with the input - arguments as is. - -NOTES: - Changes to the contents of this file must be propagated from - //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc to - //third_party/gpus/crosstool/v*/*/clang/bin/crosstool_wrapper_is_not_gcc -""" - -from __future__ import print_function - -__author__ = 'keveman@google.com (Manjunath Kudlur)' - -from argparse import ArgumentParser -import os -import subprocess -import re -import sys -import pipes - -# Template values set by cuda_autoconf. -CPU_COMPILER = ('%{cpu_compiler}') -GCC_HOST_COMPILER_PATH = ('%{gcc_host_compiler_path}') - -NVCC_PATH = '%{nvcc_path}' -PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH) -NVCC_VERSION = '%{cuda_version}' - -def Log(s): - print('gpus/crosstool: {0}'.format(s)) - - -def GetOptionValue(argv, option): - """Extract the list of values for option from the argv list. - - Args: - argv: A list of strings, possibly the argv passed to main(). 
- option: The option whose value to extract, without the leading '-'. - - Returns: - A list of values, either directly following the option, - (eg., -opt val1 val2) or values collected from multiple occurrences of - the option (eg., -opt val1 -opt val2). - """ - - parser = ArgumentParser() - parser.add_argument('-' + option, nargs='*', action='append') - args, _ = parser.parse_known_args(argv) - if not args or not vars(args)[option]: - return [] - else: - return sum(vars(args)[option], []) - - -def GetHostCompilerOptions(argv): - """Collect the -isystem, -iquote, and --sysroot option values from argv. - - Args: - argv: A list of strings, possibly the argv passed to main(). - - Returns: - The string that can be used as the --compiler-options to nvcc. - """ - - parser = ArgumentParser() - parser.add_argument('-isystem', nargs='*', action='append') - parser.add_argument('-iquote', nargs='*', action='append') - parser.add_argument('--sysroot', nargs=1) - parser.add_argument('-g', nargs='*', action='append') - parser.add_argument('-fno-canonical-system-headers', action='store_true') - - args, _ = parser.parse_known_args(argv) - - opts = '' - - if args.isystem: - opts += ' -isystem ' + ' -isystem '.join(sum(args.isystem, [])) - if args.iquote: - opts += ' -iquote ' + ' -iquote '.join(sum(args.iquote, [])) - if args.g: - opts += ' -g' + ' -g'.join(sum(args.g, [])) - if args.fno_canonical_system_headers: - opts += ' -fno-canonical-system-headers' - if args.sysroot: - opts += ' --sysroot ' + args.sysroot[0] - - return opts - -def _update_options(nvcc_options): - if NVCC_VERSION in ("7.0",): - return nvcc_options - - update_options = { "relaxed-constexpr" : "expt-relaxed-constexpr" } - return [ update_options[opt] if opt in update_options else opt - for opt in nvcc_options ] - -def GetNvccOptions(argv): - """Collect the -nvcc_options values from argv. - - Args: - argv: A list of strings, possibly the argv passed to main(). - - Returns: - The string that can be passed directly to nvcc. - """ - - parser = ArgumentParser() - parser.add_argument('-nvcc_options', nargs='*', action='append') - - args, _ = parser.parse_known_args(argv) - - if args.nvcc_options: - options = _update_options(sum(args.nvcc_options, [])) - return ' '.join(['--'+a for a in options]) - return '' - - -def InvokeNvcc(argv, log=False): - """Call nvcc with arguments assembled from argv. - - Args: - argv: A list of strings, possibly the argv passed to main(). - log: True if logging is requested. - - Returns: - The return value of calling os.system('nvcc ' + args) - """ - - host_compiler_options = GetHostCompilerOptions(argv) - nvcc_compiler_options = GetNvccOptions(argv) - opt_option = GetOptionValue(argv, 'O') - m_options = GetOptionValue(argv, 'm') - m_options = ''.join([' -m' + m for m in m_options if m in ['32', '64']]) - include_options = GetOptionValue(argv, 'I') - out_file = GetOptionValue(argv, 'o') - depfiles = GetOptionValue(argv, 'MF') - defines = GetOptionValue(argv, 'D') - defines = ''.join([' -D' + define for define in defines]) - undefines = GetOptionValue(argv, 'U') - undefines = ''.join([' -U' + define for define in undefines]) - std_options = GetOptionValue(argv, 'std') - # Supported -std flags as of CUDA 9.0. Only keep last to mimic gcc/clang. - nvcc_allowed_std_options = ["c++03", "c++11", "c++14", "c++17"] - std_options = ''.join([' -std=' + define - for define in std_options if define in nvcc_allowed_std_options]) - - # The list of source files get passed after the -c option. 
I don't know of - # any other reliable way to just get the list of source files to be compiled. - src_files = GetOptionValue(argv, 'c') - - # Pass -w through from host to nvcc, but don't do anything fancier with - # warnings-related flags, since they're not necessarily the same across - # compilers. - warning_options = ' -w' if '-w' in argv else '' - - if len(src_files) == 0: - return 1 - if len(out_file) != 1: - return 1 - - opt = (' -O2' if (len(opt_option) > 0 and int(opt_option[0]) > 0) - else ' -g -G') - - includes = (' -I ' + ' -I '.join(include_options) - if len(include_options) > 0 - else '') - - # Unfortunately, there are other options that have -c prefix too. - # So allowing only those look like C/C++ files. - src_files = [f for f in src_files if - re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)] - srcs = ' '.join(src_files) - out = ' -o ' + out_file[0] - - supported_cuda_compute_capabilities = [ %{cuda_compute_capabilities} ] - nvccopts = '-D_FORCE_INLINES ' - supported_cuda_compute_capabilities = sorted([ - x.replace(".", "") for x in supported_cuda_compute_capabilities]) - for capability in supported_cuda_compute_capabilities[:-1]: - nvccopts += r'-gencode=arch=compute_%s,\"code=sm_%s\" ' % ( - capability, capability) - if supported_cuda_compute_capabilities: - capability = supported_cuda_compute_capabilities[-1] - nvccopts += r'-gencode=arch=compute_%s,code=\"sm_%s,compute_%s\" ' % ( - capability, capability, capability) - nvccopts += ' ' + nvcc_compiler_options - nvccopts += undefines - nvccopts += defines - nvccopts += std_options - nvccopts += m_options - nvccopts += warning_options - - if depfiles: - # Generate the dependency file - depfile = depfiles[0] - cmd = (NVCC_PATH + ' ' + nvccopts + - ' --compiler-options "' + host_compiler_options + '"' + - ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH + - ' -I .' + - ' -x cu ' + opt + includes + ' ' + srcs + ' -M -o ' + depfile) - if log: Log(cmd) - exit_status = os.system(cmd) - if exit_status != 0: - return exit_status - - cmd = (NVCC_PATH + ' ' + nvccopts + - ' --compiler-options "' + host_compiler_options + ' -fPIC"' + - ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH + - ' -I .' + - ' -x cu ' + opt + includes + ' -c ' + srcs + out) - - # TODO(zhengxq): for some reason, 'gcc' needs this help to find 'as'. - # Need to investigate and fix. - cmd = 'PATH=' + PREFIX_DIR + ':$PATH ' + cmd - if log: Log(cmd) - return os.system(cmd) - - -def main(): - parser = ArgumentParser() - parser.add_argument('-x', nargs=1) - parser.add_argument('--cuda_log', action='store_true') - args, leftover = parser.parse_known_args(sys.argv[1:]) - - if args.x and args.x[0] == 'cuda': - if args.cuda_log: Log('-x cuda') - leftover = [pipes.quote(s) for s in leftover] - if args.cuda_log: Log('using nvcc') - return InvokeNvcc(leftover, log=args.cuda_log) - - # Strip our flags before passing through to the CPU compiler for files which - # are not -x cuda. We can't just pass 'leftover' because it also strips -x. - # We not only want to pass -x to the CPU compiler, but also keep it in its - # relative location in the argv list (the compiler is actually sensitive to - # this). 
- cpu_compiler_flags = [flag for flag in sys.argv[1:] - if not flag.startswith(('--cuda_log'))] - - return subprocess.call([CPU_COMPILER] + cpu_compiler_flags) - -if __name__ == '__main__': - sys.exit(main()) diff --git a/build_deps/toolchains/gpu/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl b/build_deps/toolchains/gpu/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl deleted file mode 100644 index 1a097568..00000000 --- a/build_deps/toolchains/gpu/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env python -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Crosstool wrapper for compiling CUDA programs with nvcc on Windows. - -DESCRIPTION: - This script is the Windows version of //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc -""" - -from __future__ import print_function - -from argparse import ArgumentParser -import os -import subprocess -import re -import sys -import pipes - -# Template values set by cuda_autoconf. -CPU_COMPILER = ('%{cpu_compiler}') -GCC_HOST_COMPILER_PATH = ('%{gcc_host_compiler_path}') - -NVCC_PATH = '%{nvcc_path}' -NVCC_VERSION = '%{cuda_version}' -NVCC_TEMP_DIR = "%{nvcc_tmp_dir}" -supported_cuda_compute_capabilities = [ %{cuda_compute_capabilities} ] - -def Log(s): - print('gpus/crosstool: {0}'.format(s)) - - -def GetOptionValue(argv, option): - """Extract the list of values for option from options. - - Args: - option: The option whose value to extract, without the leading '/'. - - Returns: - 1. A list of values, either directly following the option, - (eg., /opt val1 val2) or values collected from multiple occurrences of - the option (eg., /opt val1 /opt val2). - 2. The leftover options. - """ - - parser = ArgumentParser(prefix_chars='/') - parser.add_argument('/' + option, nargs='*', action='append') - args, leftover = parser.parse_known_args(argv) - if args and vars(args)[option]: - return (sum(vars(args)[option], []), leftover) - return ([], leftover) - -def _update_options(nvcc_options): - if NVCC_VERSION in ("7.0",): - return nvcc_options - - update_options = { "relaxed-constexpr" : "expt-relaxed-constexpr" } - return [ update_options[opt] if opt in update_options else opt - for opt in nvcc_options ] - -def GetNvccOptions(argv): - """Collect the -nvcc_options values from argv. - - Args: - argv: A list of strings, possibly the argv passed to main(). - - Returns: - 1. The string that can be passed directly to nvcc. - 2. The leftover options. - """ - - parser = ArgumentParser() - parser.add_argument('-nvcc_options', nargs='*', action='append') - - args, leftover = parser.parse_known_args(argv) - - if args.nvcc_options: - options = _update_options(sum(args.nvcc_options, [])) - return (['--' + a for a in options], leftover) - return ([], leftover) - - -def InvokeNvcc(argv, log=False): - """Call nvcc with arguments assembled from argv. 
- - Args: - argv: A list of strings, possibly the argv passed to main(). - log: True if logging is requested. - - Returns: - The return value of calling os.system('nvcc ' + args) - """ - - src_files = [f for f in argv if - re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)] - if len(src_files) == 0: - raise Error('No source files found for cuda compilation.') - - out_file = [ f for f in argv if f.startswith('/Fo') ] - if len(out_file) != 1: - raise Error('Please sepecify exactly one output file for cuda compilation.') - out = ['-o', out_file[0][len('/Fo'):]] - - nvcc_compiler_options, argv = GetNvccOptions(argv) - - opt_option, argv = GetOptionValue(argv, 'O') - opt = ['-g', '-G'] - if (len(opt_option) > 0 and opt_option[0] != 'd'): - opt = ['-O2'] - - include_options, argv = GetOptionValue(argv, 'I') - includes = ["-I " + include for include in include_options] - - defines, argv = GetOptionValue(argv, 'D') - defines = ['-D' + define for define in defines] - - undefines, argv = GetOptionValue(argv, 'U') - undefines = ['-U' + define for define in undefines] - - # The rest of the unrecongized options should be passed to host compiler - host_compiler_options = [option for option in argv if option not in (src_files + out_file)] - - m_options = ["-m64"] - - nvccopts = ['-D_FORCE_INLINES'] - for capability in supported_cuda_compute_capabilities: - capability = capability.replace('.', '') - nvccopts += [r'-gencode=arch=compute_%s,"code=sm_%s,compute_%s"' % ( - capability, capability, capability)] - nvccopts += nvcc_compiler_options - nvccopts += undefines - nvccopts += defines - nvccopts += m_options - nvccopts += ['--compiler-options="' + " ".join(host_compiler_options) + '"'] - nvccopts += ['-x', 'cu'] + opt + includes + out + ['-c'] + src_files - # If we don't specify --keep-dir, nvcc will generate intermediate files under TEMP - # Put them under NVCC_TEMP_DIR instead, then Bazel can ignore files under NVCC_TEMP_DIR during dependency check - # http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-guiding-compiler-driver - # Different actions are sharing NVCC_TEMP_DIR, so we cannot remove it if the directory already exists. - if os.path.isfile(NVCC_TEMP_DIR): - os.remove(NVCC_TEMP_DIR) - if not os.path.exists(NVCC_TEMP_DIR): - os.makedirs(NVCC_TEMP_DIR) - nvccopts += ['--keep', '--keep-dir', NVCC_TEMP_DIR] - cmd = [NVCC_PATH] + nvccopts - if log: - Log(cmd) - proc = subprocess.Popen(cmd, - stdout=sys.stdout, - stderr=sys.stderr, - env=os.environ.copy(), - shell=True) - proc.wait() - return proc.returncode - -def main(): - parser = ArgumentParser() - parser.add_argument('-x', nargs=1) - parser.add_argument('--cuda_log', action='store_true') - args, leftover = parser.parse_known_args(sys.argv[1:]) - - if args.x and args.x[0] == 'cuda': - if args.cuda_log: Log('-x cuda') - leftover = [pipes.quote(s) for s in leftover] - if args.cuda_log: Log('using nvcc') - return InvokeNvcc(leftover, log=args.cuda_log) - - # Strip our flags before passing through to the CPU compiler for files which - # are not -x cuda. We can't just pass 'leftover' because it also strips -x. - # We not only want to pass -x to the CPU compiler, but also keep it in its - # relative location in the argv list (the compiler is actually sensitive to - # this). 
- cpu_compiler_flags = [flag for flag in sys.argv[1:] - if not flag.startswith(('--cuda_log')) - and not flag.startswith(('-nvcc_options'))] - - return subprocess.call([CPU_COMPILER] + cpu_compiler_flags) - -if __name__ == '__main__': - sys.exit(main()) diff --git a/build_deps/toolchains/gpu/cub.BUILD b/build_deps/toolchains/gpu/cub.BUILD deleted file mode 100644 index cdc9e4f3..00000000 --- a/build_deps/toolchains/gpu/cub.BUILD +++ /dev/null @@ -1,25 +0,0 @@ -# Description: CUB library which is a set of primitives for GPU programming. - -load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts", "if_cuda") - -package( - default_visibility = ["//visibility:public"], -) - -licenses(["notice"]) # BSD - -filegroup( - name = "cub_header_files", - srcs = glob([ - "cub/**", - ]), -) - -cc_library( - name = "cub", - hdrs = if_cuda([":cub_header_files"]), - include_prefix = "gpu", - deps = [ - "@local_config_cuda//cuda:cuda_headers", - ], -) diff --git a/build_deps/toolchains/gpu/cuda/BUILD.tpl b/build_deps/toolchains/gpu/cuda/BUILD.tpl deleted file mode 100644 index 1ac5643f..00000000 --- a/build_deps/toolchains/gpu/cuda/BUILD.tpl +++ /dev/null @@ -1,227 +0,0 @@ -load(":build_defs.bzl", "cuda_header_library") - -licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like - -package(default_visibility = ["//visibility:public"]) - -config_setting( - name = "using_nvcc", - values = { - "define": "using_cuda_nvcc=true", - }, -) - -config_setting( - name = "using_clang", - values = { - "define": "using_cuda_clang=true", - }, -) - -# Equivalent to using_clang && -c opt. -config_setting( - name = "using_clang_opt", - values = { - "define": "using_cuda_clang=true", - "compilation_mode": "opt", - }, -) - -config_setting( - name = "darwin", - values = {"cpu": "darwin"}, - visibility = ["//visibility:public"], -) - -config_setting( - name = "freebsd", - values = {"cpu": "freebsd"}, - visibility = ["//visibility:public"], -) - -cuda_header_library( - name = "cuda_headers", - hdrs = [ - %{cuda_headers} - ], - include_prefix = "third_party/gpus", - includes = [ - ".", - "cuda/include", - "cuda/include/crt", - ], - visibility = ["//visibility:public"], -) - -cc_library( - name = "cudart_static", - srcs = ["cuda/lib/%{cudart_static_lib}"], - includes = [ - ".", - "cuda/include", - ], - linkopts = select({ - ":freebsd": [], - "//conditions:default": ["-ldl"], - }) + [ - "-lpthread", - %{cudart_static_linkopt} - ], - visibility = ["//visibility:public"], -) - -cc_library( - name = "cuda_driver", - srcs = ["cuda/lib/%{cuda_driver_lib}"], - includes = [ - ".", - "cuda/include", - ], - visibility = ["//visibility:public"], -) - -cc_library( - name = "cudart", - srcs = ["cuda/lib/%{cudart_lib}"], - data = ["cuda/lib/%{cudart_lib}"], - includes = [ - ".", - "cuda/include", - ], - linkstatic = 1, - visibility = ["//visibility:public"], -) - -cc_library( - name = "cublas", - srcs = ["cuda/lib/%{cublas_lib}"], - data = ["cuda/lib/%{cublas_lib}"], - includes = [ - ".", - "cuda/include", - ], - linkstatic = 1, - visibility = ["//visibility:public"], -) - -cc_library( - name = "cusolver", - srcs = ["cuda/lib/%{cusolver_lib}"], - data = ["cuda/lib/%{cusolver_lib}"], - includes = [ - ".", - "cuda/include", - ], - linkopts = ["-lgomp"], - linkstatic = 1, - visibility = ["//visibility:public"], -) - -cc_library( - name = "cudnn", - srcs = ["cuda/lib/%{cudnn_lib}"], - data = ["cuda/lib/%{cudnn_lib}"], - includes = [ - ".", - "cuda/include", - ], - linkstatic = 1, - visibility = ["//visibility:public"], -) - 
-cc_library( - name = "cudnn_header", - includes = [ - ".", - "cuda/include", - ], - visibility = ["//visibility:public"], -) - -cc_library( - name = "cufft", - srcs = ["cuda/lib/%{cufft_lib}"], - data = ["cuda/lib/%{cufft_lib}"], - includes = [ - ".", - "cuda/include", - ], - linkstatic = 1, - visibility = ["//visibility:public"], -) - -cc_library( - name = "curand", - srcs = ["cuda/lib/%{curand_lib}"], - data = ["cuda/lib/%{curand_lib}"], - includes = [ - ".", - "cuda/include", - ], - linkstatic = 1, - visibility = ["//visibility:public"], -) - -cc_library( - name = "cuda", - visibility = ["//visibility:public"], - deps = [ - ":cublas", - ":cuda_headers", - ":cudart", - ":cudnn", - ":cufft", - ":curand", - ], -) - -cc_library( - name = "cupti_headers", - hdrs = [ - "cuda/cuda_config.h", - ":cuda-extras", - ], - includes = [ - ".", - "cuda/extras/CUPTI/include/", - ], - visibility = ["//visibility:public"], -) - -cc_library( - name = "cupti_dsos", - data = ["cuda/lib/%{cupti_lib}"], - includes = [ - ".", - "cuda/include", - ], - visibility = ["//visibility:public"], -) - -cc_library( - name = "libdevice_root", - data = [":cuda-nvvm"], - visibility = ["//visibility:public"], -) - -cc_library( - name = "cuda_libs", - data = [ - ":cudart", - ], - linkopts = select({ - ":darwin": [ - "-Wl,-rpath,./lib", - "-Wl,-rpath,./extras/CUPTI/lib", - ], - "//conditions:default": [ - "-Wl,-rpath,./lib64", - "-Wl,-rpath,./extras/CUPTI/lib64", - ], - }), - deps = [ - ":cudart", - ], -) - -%{copy_rules} diff --git a/build_deps/toolchains/gpu/cuda/BUILD.windows.tpl b/build_deps/toolchains/gpu/cuda/BUILD.windows.tpl deleted file mode 100644 index 3ed4fd41..00000000 --- a/build_deps/toolchains/gpu/cuda/BUILD.windows.tpl +++ /dev/null @@ -1,164 +0,0 @@ -licenses(["restricted"]) # MPL2, portions GPL v3, LGPL v3, BSD-like - -package(default_visibility = ["//visibility:public"]) - -config_setting( - name = "using_nvcc", - values = { - "define": "using_cuda_nvcc=true", - }, -) - -config_setting( - name = "using_clang", - values = { - "define": "using_cuda_clang=true", - }, -) - -# Equivalent to using_clang && -c opt. -config_setting( - name = "using_clang_opt", - values = { - "define": "using_cuda_clang=true", - "compilation_mode": "opt", - }, -) - -config_setting( - name = "darwin", - values = {"cpu": "darwin"}, - visibility = ["//visibility:public"], -) - -config_setting( - name = "freebsd", - values = {"cpu": "freebsd"}, - visibility = ["//visibility:public"], -) - -cc_library( - name = "cuda_headers", - hdrs = [ - "cuda/cuda_config.h", - %{cuda_headers} - ], - includes = [ - ".", - "cuda/include", - "cuda/include/crt", - ], - visibility = ["//visibility:public"], -) - -cc_import( - name = "cudart_static", - # /WHOLEARCHIVE:cudart_static.lib will cause a - # "Internal error during CImplib::EmitThunk" error. - # Treat this library as interface library to avoid being whole archived when - # linking a DLL that depends on this. - # TODO(pcloudy): Remove this rule after b/111278841 is resolved. 
- interface_library = "cuda/lib/%{cudart_static_lib}", - system_provided = 1, - visibility = ["//visibility:public"], -) - -cc_import( - name = "cuda_driver", - interface_library = "cuda/lib/%{cuda_driver_lib}", - system_provided = 1, - visibility = ["//visibility:public"], -) - -cc_import( - name = "cudart", - interface_library = "cuda/lib/%{cudart_lib}", - system_provided = 1, - visibility = ["//visibility:public"], -) - -cc_import( - name = "cublas", - interface_library = "cuda/lib/%{cublas_lib}", - system_provided = 1, - visibility = ["//visibility:public"], -) - -cc_import( - name = "cusolver", - interface_library = "cuda/lib/%{cusolver_lib}", - system_provided = 1, - visibility = ["//visibility:public"], -) - -cc_import( - name = "cudnn", - interface_library = "cuda/lib/%{cudnn_lib}", - system_provided = 1, - visibility = ["//visibility:public"], -) - -cc_library( - name = "cudnn_header", - includes = [ - ".", - "cuda/include", - ], - visibility = ["//visibility:public"], -) - -cc_import( - name = "cufft", - interface_library = "cuda/lib/%{cufft_lib}", - system_provided = 1, - visibility = ["//visibility:public"], -) - -cc_import( - name = "curand", - interface_library = "cuda/lib/%{curand_lib}", - system_provided = 1, - visibility = ["//visibility:public"], -) - -cc_library( - name = "cuda", - visibility = ["//visibility:public"], - deps = [ - ":cublas", - ":cuda_headers", - ":cudart", - ":cudnn", - ":cufft", - ":curand", - ], -) - -cc_library( - name = "cupti_headers", - hdrs = [ - "cuda/cuda_config.h", - ":cuda-extras", - ], - includes = [ - ".", - "cuda/", - "cuda/extras/CUPTI/include/", - ], - visibility = ["//visibility:public"], -) - -cc_import( - name = "cupti_dsos", - interface_library = "cuda/lib/%{cupti_lib}", - system_provided = 1, - visibility = ["//visibility:public"], -) - -cc_library( - name = "libdevice_root", - data = [":cuda-nvvm"], - visibility = ["//visibility:public"], -) - -%{copy_rules} diff --git a/build_deps/toolchains/gpu/cuda/build_defs.bzl.tpl b/build_deps/toolchains/gpu/cuda/build_defs.bzl.tpl deleted file mode 100644 index a4f484fb..00000000 --- a/build_deps/toolchains/gpu/cuda/build_defs.bzl.tpl +++ /dev/null @@ -1,62 +0,0 @@ -# Macros for building CUDA code. -def if_cuda(if_true, if_false = []): - """Shorthand for select()'ing on whether we're building with CUDA. - - Returns a select statement which evaluates to if_true if we're building - with CUDA enabled. Otherwise, the select statement evaluates to if_false. - - """ - return select({ - "@local_config_cuda//cuda:using_nvcc": if_true, - "@local_config_cuda//cuda:using_clang": if_true, - "//conditions:default": if_false - }) - - -def cuda_default_copts(): - """Default options for all CUDA compilations.""" - return if_cuda(["-x", "cuda", "-DGOOGLE_CUDA=1"] + %{cuda_extra_copts}) - - -def cuda_is_configured(): - """Returns true if CUDA was enabled during the configure process.""" - return %{cuda_is_configured} - -def if_cuda_is_configured(x): - """Tests if the CUDA was enabled during the configure process. - - Unlike if_cuda(), this does not require that we are building with - --config=cuda. Used to allow non-CUDA code to depend on CUDA libraries. - """ - if cuda_is_configured(): - return x - return [] - -def cuda_header_library( - name, - hdrs, - include_prefix = None, - strip_include_prefix = None, - deps = [], - **kwargs): - """Generates a cc_library containing both virtual and system include paths. 
- - Generates both a header-only target with virtual includes plus the full - target without virtual includes. This works around the fact that bazel can't - mix 'includes' and 'include_prefix' in the same target.""" - - native.cc_library( - name = name + "_virtual", - hdrs = hdrs, - include_prefix = include_prefix, - strip_include_prefix = strip_include_prefix, - deps = deps, - visibility = ["//visibility:private"], - ) - - native.cc_library( - name = name, - textual_hdrs = hdrs, - deps = deps + [":%s_virtual" % name], - **kwargs - ) diff --git a/build_deps/toolchains/gpu/cuda/cuda_config.h.tpl b/build_deps/toolchains/gpu/cuda/cuda_config.h.tpl deleted file mode 100644 index 811b040e..00000000 --- a/build_deps/toolchains/gpu/cuda/cuda_config.h.tpl +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef CUDA_CUDA_CONFIG_H_ -#define CUDA_CUDA_CONFIG_H_ - -#define TF_CUDA_CAPABILITIES %{cuda_compute_capabilities} - -#define TF_CUDA_VERSION "%{cuda_version}" -#define TF_CUDNN_VERSION "%{cudnn_version}" - -#define TF_CUDA_TOOLKIT_PATH "%{cuda_toolkit_path}" - -#endif // CUDA_CUDA_CONFIG_H_ diff --git a/build_deps/toolchains/gpu/cuda_configure.bzl b/build_deps/toolchains/gpu/cuda_configure.bzl deleted file mode 100644 index ba38c6b5..00000000 --- a/build_deps/toolchains/gpu/cuda_configure.bzl +++ /dev/null @@ -1,1116 +0,0 @@ -# -*- Python -*- -"""Repository rule for CUDA autoconfiguration. -`cuda_configure` depends on the following environment variables: - * `TF_NEED_CUDA`: Whether to enable building with CUDA. - * `GCC_HOST_COMPILER_PATH`: The GCC host compiler path - * `TF_CUDA_CLANG`: Whether to use clang as a cuda compiler. - * `CLANG_CUDA_COMPILER_PATH`: The clang compiler path that will be used for - both host and device code compilation if TF_CUDA_CLANG is 1. - * `TF_CUDA_PATHS`: The base paths to look for CUDA and cuDNN. Default is - `/usr/local/cuda,usr/`. - * `CUDA_TOOLKIT_PATH`: The path to the CUDA toolkit. Default is - `/usr/local/cuda`. - * `TF_CUDA_VERSION`: The version of the CUDA toolkit. If this is blank, then - use the system default. - * `TF_CUDNN_VERSION`: The version of the cuDNN library. - * `CUDNN_INSTALL_PATH`: The path to the cuDNN library. Default is - `/usr/local/cuda`. - * `TF_CUDA_COMPUTE_CAPABILITIES`: The CUDA compute capabilities. Default is - `3.5,5.2`. 
- * `PYTHON_BIN_PATH`: The python binary path -""" - -load( - "@bazel_tools//tools/cpp:lib_cc_configure.bzl", - "escape_string", - "get_env_var", -) -load( - "@bazel_tools//tools/cpp:windows_cc_configure.bzl", - "find_msvc_tool", - "find_vc_path", - "setup_vc_env_vars", -) - -_GCC_HOST_COMPILER_PATH = "GCC_HOST_COMPILER_PATH" - -_CLANG_CUDA_COMPILER_PATH = "CLANG_CUDA_COMPILER_PATH" - -_CUDA_TOOLKIT_PATH = "CUDA_TOOLKIT_PATH" - -_TF_CUDA_VERSION = "TF_CUDA_VERSION" - -_TF_CUDNN_VERSION = "TF_CUDNN_VERSION" - -_CUDNN_INSTALL_PATH = "CUDNN_INSTALL_PATH" - -_TF_CUDA_COMPUTE_CAPABILITIES = "TF_CUDA_COMPUTE_CAPABILITIES" - -_TF_DOWNLOAD_CLANG = "TF_DOWNLOAD_CLANG" - -_PYTHON_BIN_PATH = "PYTHON_BIN_PATH" - -_DEFAULT_CUDA_COMPUTE_CAPABILITIES = [ - "3.5", - "5.2", -] - -def _get_python_bin(repository_ctx): - """Gets the python bin path.""" - python_bin = repository_ctx.os.environ.get(_PYTHON_BIN_PATH) - if python_bin != None: - return python_bin - python_bin_name = "python.exe" if _is_windows(repository_ctx) else "python" - python_bin_path = repository_ctx.which(python_bin_name) - if python_bin_path != None: - return str(python_bin_path) - auto_configure_fail( - "Cannot find python in PATH, please make sure " + - "python is installed and add its directory in PATH, or --define " + - "%s='/something/else'.\nPATH=%s" % ( - _PYTHON_BIN_PATH, - repository_ctx.os.environ.get("PATH", ""), - ), - ) - -def _get_nvcc_tmp_dir_for_windows(repository_ctx): - """Return the tmp directory for nvcc to generate intermediate source files.""" - escaped_tmp_dir = escape_string( - get_env_var(repository_ctx, "TMP", "C:\\Windows\\Temp").replace( - "\\", - "\\\\", - ), - ) - return escaped_tmp_dir + "\\\\nvcc_inter_files_tmp_dir" - -def _get_msvc_compiler(repository_ctx): - vc_path = find_vc_path(repository_ctx) - return find_msvc_tool(repository_ctx, vc_path, "cl.exe").replace("\\", "/") - -def _get_win_cuda_defines(repository_ctx): - """Return CROSSTOOL defines for Windows""" - - # If we are not on Windows, return empty vaules for Windows specific fields. - # This ensures the CROSSTOOL file parser is happy. - if not _is_windows(repository_ctx): - return dict({ - "%{msvc_env_tmp}": "", - "%{msvc_env_path}": "", - "%{msvc_env_include}": "", - "%{msvc_env_lib}": "", - "%{msvc_cl_path}": "", - "%{msvc_ml_path}": "", - "%{msvc_link_path}": "", - "%{msvc_lib_path}": "", - "%{cxx_builtin_include_directory}": "", - }) - - vc_path = find_vc_path(repository_ctx) - if not vc_path: - auto_configure_fail( - "Visual C++ build tools not found on your machine. 
" + - "Please check your installation following https://docs.bazel.build/versions/master/windows.html#using", - ) - return {} - - env = setup_vc_env_vars(repository_ctx, vc_path) - escaped_paths = escape_string(env["PATH"]) - escaped_include_paths = escape_string(env["INCLUDE"]) - escaped_lib_paths = escape_string(env["LIB"]) - escaped_tmp_dir = escape_string( - get_env_var(repository_ctx, "TMP", "C:\\Windows\\Temp").replace( - "\\", - "\\\\", - ), - ) - - msvc_cl_path = _get_python_bin(repository_ctx) - msvc_ml_path = find_msvc_tool(repository_ctx, vc_path, "ml64.exe").replace( - "\\", - "/", - ) - msvc_link_path = find_msvc_tool(repository_ctx, vc_path, "link.exe").replace( - "\\", - "/", - ) - msvc_lib_path = find_msvc_tool(repository_ctx, vc_path, "lib.exe").replace( - "\\", - "/", - ) - - # nvcc will generate some temporary source files under %{nvcc_tmp_dir} - # The generated files are guranteed to have unique name, so they can share the same tmp directory - escaped_cxx_include_directories = [ - "cxx_builtin_include_directory: \"%s\"" % - _get_nvcc_tmp_dir_for_windows(repository_ctx), - ] - for path in escaped_include_paths.split(";"): - if path: - escaped_cxx_include_directories.append( - "cxx_builtin_include_directory: \"%s\"" % path, - ) - - return { - "%{msvc_env_tmp}": escaped_tmp_dir, - "%{msvc_env_path}": escaped_paths, - "%{msvc_env_include}": escaped_include_paths, - "%{msvc_env_lib}": escaped_lib_paths, - "%{msvc_cl_path}": msvc_cl_path, - "%{msvc_ml_path}": msvc_ml_path, - "%{msvc_link_path}": msvc_link_path, - "%{msvc_lib_path}": msvc_lib_path, - "%{cxx_builtin_include_directory}": "\n".join(escaped_cxx_include_directories), - } - -def find_cc(repository_ctx): - """Find the C++ compiler.""" - if _is_windows(repository_ctx): - return _get_msvc_compiler(repository_ctx) - - target_cc_name = "gcc" - cc_path_envvar = _GCC_HOST_COMPILER_PATH - cc_name = target_cc_name - - if cc_path_envvar in repository_ctx.os.environ: - cc_name_from_env = repository_ctx.os.environ[cc_path_envvar].strip() - if cc_name_from_env: - cc_name = cc_name_from_env - if cc_name.startswith("/"): - # Absolute path, maybe we should make this supported by our which function. - return cc_name - cc = repository_ctx.which(cc_name) - if cc == None: - fail(("Cannot find {}, either correct your path or set the {}" + - " environment variable").format(target_cc_name, cc_path_envvar)) - return cc - -_INC_DIR_MARKER_BEGIN = "#include <...>" - -# OSX add " (framework directory)" at the end of line, strip it. -_OSX_FRAMEWORK_SUFFIX = " (framework directory)" - -_OSX_FRAMEWORK_SUFFIX_LEN = len(_OSX_FRAMEWORK_SUFFIX) - -def _cxx_inc_convert(path): - """Convert path returned by cc -E xc++ in a complete path.""" - path = path.strip() - if path.endswith(_OSX_FRAMEWORK_SUFFIX): - path = path[:-_OSX_FRAMEWORK_SUFFIX_LEN].strip() - return path - -def _normalize_include_path(repository_ctx, path): - """Normalizes include paths before writing them to the crosstool. - If path points inside the 'crosstool' folder of the repository, a relative - path is returned. - If path points outside the 'crosstool' folder, an absolute path is returned. - """ - path = str(repository_ctx.path(path)) - crosstool_folder = str(repository_ctx.path(".").get_child("crosstool")) - - if path.startswith(crosstool_folder): - # We drop the path to "$REPO/crosstool" and a trailing path separator. 
- return path[len(crosstool_folder) + 1:] - return path - -def _get_cxx_inc_directories_impl(repository_ctx, cc, lang_is_cpp): - """Compute the list of default C or C++ include directories.""" - if lang_is_cpp: - lang = "c++" - else: - lang = "c" - result = repository_ctx.execute([cc, "-E", "-x" + lang, "-", "-v"]) - index1 = result.stderr.find(_INC_DIR_MARKER_BEGIN) - if index1 == -1: - return [] - index1 = result.stderr.find("\n", index1) - if index1 == -1: - return [] - index2 = result.stderr.rfind("\n ") - if index2 == -1 or index2 < index1: - return [] - index2 = result.stderr.find("\n", index2 + 1) - if index2 == -1: - inc_dirs = result.stderr[index1 + 1:] - else: - inc_dirs = result.stderr[index1 + 1:index2].strip() - - return [ - _normalize_include_path(repository_ctx, _cxx_inc_convert(p)) - for p in inc_dirs.split("\n") - ] - -def get_cxx_inc_directories(repository_ctx, cc): - """Compute the list of default C and C++ include directories.""" - - # For some reason `clang -xc` sometimes returns include paths that are - # different from the ones from `clang -xc++`. (Symlink and a dir) - # So we run the compiler with both `-xc` and `-xc++` and merge resulting lists - includes_cpp = _get_cxx_inc_directories_impl(repository_ctx, cc, True) - includes_c = _get_cxx_inc_directories_impl(repository_ctx, cc, False) - - return includes_cpp + [ - inc - for inc in includes_c - if inc not in includes_cpp - ] - -def auto_configure_fail(msg): - """Output failure message when cuda configuration fails.""" - red = "\033[0;31m" - no_color = "\033[0m" - fail("\n%sCuda Configuration Error:%s %s\n" % (red, no_color, msg)) - -# END cc_configure common functions (see TODO above). - -def _host_compiler_includes(repository_ctx, cc): - """Generates the cxx_builtin_include_directory entries for gcc inc dirs. - Args: - repository_ctx: The repository context. - cc: The path to the gcc host compiler. - Returns: - A string containing the cxx_builtin_include_directory for each of the gcc - host compiler include directories, which can be added to the CROSSTOOL - file. - """ - inc_dirs = get_cxx_inc_directories(repository_ctx, cc) - inc_entries = [] - for inc_dir in inc_dirs: - inc_entries.append(" cxx_builtin_include_directory: \"%s\"" % inc_dir) - return "\n".join(inc_entries) - -def _cuda_include_path(repository_ctx, cuda_config): - """Generates the cxx_builtin_include_directory entries for cuda inc dirs. - Args: - repository_ctx: The repository context. - cc: The path to the gcc host compiler. - Returns: - A string containing the cxx_builtin_include_directory for each of the gcc - host compiler include directories, which can be added to the CROSSTOOL - file. 
- """ - nvcc_path = repository_ctx.path("%s/bin/nvcc%s" % ( - cuda_config.cuda_toolkit_path, - ".exe" if cuda_config.cpu_value == "Windows" else "", - )) - result = repository_ctx.execute([ - nvcc_path, - "-v", - "/dev/null", - "-o", - "/dev/null", - ]) - target_dir = "" - for one_line in result.stderr.splitlines(): - if one_line.startswith("#$ _TARGET_DIR_="): - target_dir = ( - cuda_config.cuda_toolkit_path + "/" + one_line.replace( - "#$ _TARGET_DIR_=", - "", - ) + "/include" - ) - inc_entries = [] - if target_dir != "": - inc_entries.append(" cxx_builtin_include_directory: \"%s\"" % target_dir) - default_include = cuda_config.cuda_toolkit_path + "/include" - inc_entries.append( - " cxx_builtin_include_directory: \"%s\"" % default_include, - ) - return "\n".join(inc_entries) - -def enable_cuda(repository_ctx): - if "TF_NEED_CUDA" in repository_ctx.os.environ: - enable_cuda = repository_ctx.os.environ["TF_NEED_CUDA"].strip() - return enable_cuda == "1" - return False - -def matches_version(environ_version, detected_version): - """Checks whether the user-specified version matches the detected version. - This function performs a weak matching so that if the user specifies only - the - major or major and minor versions, the versions are still considered - matching - if the version parts match. To illustrate: - environ_version detected_version result - ----------------------------------------- - 5.1.3 5.1.3 True - 5.1 5.1.3 True - 5 5.1 True - 5.1.3 5.1 False - 5.2.3 5.1.3 False - Args: - environ_version: The version specified by the user via environment - variables. - detected_version: The version autodetected from the CUDA installation on - the system. - Returns: True if user-specified version matches detected version and False - otherwise. - """ - environ_version_parts = environ_version.split(".") - detected_version_parts = detected_version.split(".") - if len(detected_version_parts) < len(environ_version_parts): - return False - for i, part in enumerate(detected_version_parts): - if i >= len(environ_version_parts): - break - if part != environ_version_parts[i]: - return False - return True - -def find_cuda_define(repository_ctx, header_dir, header_file, define): - """Returns the value of a #define in a header file. - Greps through a header file and returns the value of the specified #define. - If the #define is not found, then raise an error. - Args: - repository_ctx: The repository context. - header_dir: The directory containing the header file. - header_file: The header file name. - define: The #define to search for. - Returns: - The value of the #define found in the header. - """ - - # Confirm location of the header and grep for the line defining the macro. - h_path = repository_ctx.path("%s/%s" % (header_dir, header_file)) - if not h_path.exists: - auto_configure_fail("Cannot find %s at %s" % (header_file, str(h_path))) - result = repository_ctx.execute( - # Grep one more lines as some #defines are split into two lines. - [ - "grep", - "--color=never", - "-A1", - "-E", - define, - str(h_path), - ], - ) - if result.stderr: - auto_configure_fail("Error reading %s: %s" % (str(h_path), result.stderr)) - - # Parse the version from the line defining the macro. 
- if result.stdout.find(define) == -1: - auto_configure_fail( - "Cannot find line containing '%s' in %s" % (define, h_path), - ) - - # Split results to lines - lines = result.stdout.split("\n") - num_lines = len(lines) - for l in range(num_lines): - line = lines[l] - if define in line: # Find the line with define - version = line - if l != num_lines - 1 and line[-1] == "\\": # Add next line, if multiline - version = version[:-1] + lines[l + 1] - break - - # Remove any comments - version = version.split("//")[0] - - # Remove define name - version = version.replace(define, "").strip() - - # Remove the code after the version number. - version_end = version.find(" ") - if version_end != -1: - if version_end == 0: - auto_configure_fail( - "Cannot extract the version from line containing '%s' in %s" % - (define, str(h_path)), - ) - version = version[:version_end].strip() - return version - -def compute_capabilities(repository_ctx): - """Returns a list of strings representing cuda compute capabilities.""" - if _TF_CUDA_COMPUTE_CAPABILITIES not in repository_ctx.os.environ: - return _DEFAULT_CUDA_COMPUTE_CAPABILITIES - capabilities_str = repository_ctx.os.environ[_TF_CUDA_COMPUTE_CAPABILITIES] - capabilities = capabilities_str.split(",") - for capability in capabilities: - # Workaround for Skylark's lack of support for regex. This check should - # be equivalent to checking: - # if re.match("[0-9]+.[0-9]+", capability) == None: - parts = capability.split(".") - if len(parts) != 2 or not parts[0].isdigit() or not parts[1].isdigit(): - auto_configure_fail("Invalid compute capability: %s" % capability) - return capabilities - -def get_cpu_value(repository_ctx): - """Returns the name of the host operating system. - Args: - repository_ctx: The repository context. - Returns: - A string containing the name of the host operating system. - """ - os_name = repository_ctx.os.name.lower() - if os_name.startswith("mac os"): - return "Darwin" - if os_name.find("windows") != -1: - return "Windows" - result = repository_ctx.execute(["uname", "-s"]) - return result.stdout.strip() - -def _is_windows(repository_ctx): - """Returns true if the host operating system is windows.""" - return get_cpu_value(repository_ctx) == "Windows" - -def lib_name(base_name, cpu_value, version = None, static = False): - """Constructs the platform-specific name of a library. - Args: - base_name: The name of the library, such as "cudart" - cpu_value: The name of the host operating system. - version: The version of the library. - static: True the library is static or False if it is a shared object. - Returns: - The platform-specific name of the library. - """ - version = "" if not version else "." + version - if cpu_value in ("Linux", "FreeBSD"): - if static: - return "lib%s.a" % base_name - return "lib%s.so%s" % (base_name, version) - elif cpu_value == "Windows": - return "%s.lib" % base_name - elif cpu_value == "Darwin": - if static: - return "lib%s.a" % base_name - return "lib%s%s.dylib" % (base_name, version) - else: - auto_configure_fail("Invalid cpu_value: %s" % cpu_value) - -def find_lib(repository_ctx, paths, check_soname = True): - """ - Finds a library among a list of potential paths. - Args: - paths: List of paths to inspect. - Returns: - Returns the first path in paths that exist. 
- """ - objdump = repository_ctx.which("objdump") - mismatches = [] - for path in [repository_ctx.path(path) for path in paths]: - if not path.exists: - continue - if check_soname and objdump != None and not _is_windows(repository_ctx): - output = repository_ctx.execute([objdump, "-p", str(path)]).stdout - output = [line for line in output.splitlines() if "SONAME" in line] - sonames = [line.strip().split(" ")[-1] for line in output] - if not any([soname == path.basename for soname in sonames]): - mismatches.append(str(path)) - continue - return path - if mismatches: - auto_configure_fail( - "None of the libraries match their SONAME: " + ", ".join(mismatches), - ) - auto_configure_fail("No library found under: " + ", ".join(paths)) - -def _find_cuda_lib( - lib, - repository_ctx, - cpu_value, - basedir, - version, - static = False): - """Finds the given CUDA or cuDNN library on the system. - Args: - lib: The name of the library, such as "cudart" - repository_ctx: The repository context. - cpu_value: The name of the host operating system. - basedir: The install directory of CUDA or cuDNN. - version: The version of the library. - static: True if static library, False if shared object. - Returns: - Returns the path to the library. - """ - file_name = lib_name(lib, cpu_value, version, static) - - return find_lib( - repository_ctx, - ["%s/%s" % (basedir, file_name)], - check_soname = version and not static, - ) - -def _find_libs(repository_ctx, cuda_config): - """Returns the CUDA and cuDNN libraries on the system. - Args: - repository_ctx: The repository context. - cuda_config: The CUDA config as returned by _get_cuda_config - Returns: - Map of library names to structs of filename and path. - """ - cpu_value = cuda_config.cpu_value - stub_dir = "" if _is_windows(repository_ctx) else "/stubs" - return { - "cuda": _find_cuda_lib( - "cuda", - repository_ctx, - cpu_value, - cuda_config.config["cuda_library_dir"] + stub_dir, - None, - ), - "cudart": _find_cuda_lib( - "cudart", - repository_ctx, - cpu_value, - cuda_config.config["cuda_library_dir"], - cuda_config.cudart_version, - ), - "cudart_static": _find_cuda_lib( - "cudart_static", - repository_ctx, - cpu_value, - cuda_config.config["cuda_library_dir"], - cuda_config.cuda_version, - static = True, - ), - "cublas": _find_cuda_lib( - "cublas", - repository_ctx, - cpu_value, - cuda_config.config["cublas_library_dir"], - cuda_config.cublas_version, - ), - "cusolver": _find_cuda_lib( - "cusolver", - repository_ctx, - cpu_value, - cuda_config.config["cusolver_library_dir"], - cuda_config.cusolver_version, - ), - "curand": _find_cuda_lib( - "curand", - repository_ctx, - cpu_value, - cuda_config.config["curand_library_dir"], - cuda_config.curand_version, - ), - "cufft": _find_cuda_lib( - "cufft", - repository_ctx, - cpu_value, - cuda_config.config["cufft_library_dir"], - cuda_config.cufft_version, - ), - "cudnn": _find_cuda_lib( - "cudnn", - repository_ctx, - cpu_value, - cuda_config.config["cudnn_library_dir"], - cuda_config.cudnn_version, - ), - "cupti": _find_cuda_lib( - "cupti", - repository_ctx, - cpu_value, - cuda_config.config["cupti_library_dir"], - cuda_config.cuda_version, - ), - } - -def _cudart_static_linkopt(cpu_value): - """Returns additional platform-specific linkopts for cudart.""" - return "" if cpu_value == "Darwin" else "\"-lrt\"," - -def _get_cuda_config(repository_ctx): - """Detects and returns information about the CUDA installation on the system. - Args: - repository_ctx: The repository context. 
- Returns: - A struct containing the following fields: - cuda_toolkit_path: The CUDA toolkit installation directory. - cudnn_install_basedir: The cuDNN installation directory. - cuda_version: The version of CUDA on the system. - cudart_version: The CUDA runtime version on the system. - cudnn_version: The version of cuDNN on the system. - compute_capabilities: A list of the system's CUDA compute capabilities. - cpu_value: The name of the host operating system. - """ - config = find_cuda_config(repository_ctx, ["cuda", "cudnn"]) - cpu_value = get_cpu_value(repository_ctx) - toolkit_path = config["cuda_toolkit_path"] - - is_windows = _is_windows(repository_ctx) - cuda_version = config["cuda_version"].split(".") - cuda_major = cuda_version[0] - cuda_minor = cuda_version[1] - - cuda_version = ("64_%s%s" if is_windows else "%s.%s") % (cuda_major, cuda_minor) - cudnn_version = ("64_%s" if is_windows else "%s") % config["cudnn_version"] - - if int(cuda_major) >= 11: - # The libcudart soname in CUDA 11.x is versioned as 11.0 for backward compatability. - if int(cuda_major) == 11: - cudart_version = "64_110" if is_windows else "11.0" - else: - cudart_version = ("64_%s" if is_windows else "%s") % cuda_major - cublas_version = ("64_%s" if is_windows else "%s") % config["cublas_version"].split(".")[0] - cusolver_version = ("64_%s" if is_windows else "%s") % config["cusolver_version"].split(".")[0] - curand_version = ("64_%s" if is_windows else "%s") % config["curand_version"].split(".")[0] - cufft_version = ("64_%s" if is_windows else "%s") % config["cufft_version"].split(".")[0] - elif (int(cuda_major), int(cuda_minor)) >= (10, 1): - # cuda_lib_version is for libraries like cuBLAS, cuFFT, cuSOLVER, etc. - # It changed from 'x.y' to just 'x' in CUDA 10.1. - cuda_lib_version = ("64_%s" if is_windows else "%s") % cuda_major - cublas_version = cuda_lib_version - cusolver_version = cuda_lib_version - curand_version = cuda_lib_version - cufft_version = cuda_lib_version - cudart_version = cuda_version - else: - cublas_version = cuda_version - cusolver_version = cuda_version - curand_version = cuda_version - cufft_version = cuda_version - cudart_version = cuda_version - - return struct( - cuda_toolkit_path = toolkit_path, - cuda_version = cuda_version, - cudart_version = cudart_version, - cublas_version = cublas_version, - cusolver_version = cusolver_version, - curand_version = curand_version, - cufft_version = cufft_version, - cudnn_version = cudnn_version, - compute_capabilities = compute_capabilities(repository_ctx), - cpu_value = cpu_value, - config = config, - ) - -def _tpl(repository_ctx, tpl, substitutions = {}, out = None): - if substitutions == None: - substitutions = {} - if not out: - out = tpl.replace(":", "/") - repository_ctx.template( - out, - Label("//build_deps/toolchains/gpu/%s.tpl" % tpl), - substitutions, - ) - -_DUMMY_CROSSTOOL_BUILD_FILE = """ -load("//crosstool:error_gpu_disabled.bzl", "error_gpu_disabled") -error_gpu_disabled() -""" - -def _create_dummy_repository(repository_ctx): - cpu_value = get_cpu_value(repository_ctx) - - # Set up BUILD file for cuda/. 
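    # With CUDA disabled, the templates are instantiated with cuda_is_configured
    # set to "False" and with empty copy rules, so targets that reference
    # @local_config_cuda still resolve even though no real toolkit is present.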
- _tpl( - repository_ctx, - "cuda:build_defs.bzl", - { - "%{cuda_is_configured}": "False", - "%{cuda_extra_copts}": "[]", - }, - ) - _tpl( - repository_ctx, - "cuda:BUILD", - { - "%{cuda_driver_lib}": lib_name("cuda", cpu_value), - "%{cudart_static_lib}": lib_name( - "cudart_static", - cpu_value, - static = True, - ), - "%{cudart_static_linkopt}": _cudart_static_linkopt(cpu_value), - "%{cudart_lib}": lib_name("cudart", cpu_value), - "%{cublas_lib}": lib_name("cublas", cpu_value), - "%{cusolver_lib}": lib_name("cusolver", cpu_value), - "%{cudnn_lib}": lib_name("cudnn", cpu_value), - "%{cufft_lib}": lib_name("cufft", cpu_value), - "%{curand_lib}": lib_name("curand", cpu_value), - "%{cupti_lib}": lib_name("cupti", cpu_value), - "%{copy_rules}": "", - "%{cuda_headers}": "", - }, - ) - - # Create dummy files for the CUDA toolkit since they are still required by - # tensorflow/core/platform/default/build_config:cuda. - repository_ctx.file("cuda/cuda/include/cuda.h") - repository_ctx.file("cuda/cuda/include/cublas.h") - repository_ctx.file("cuda/cuda/include/cudnn.h") - repository_ctx.file("cuda/cuda/extras/CUPTI/include/cupti.h") - repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cuda", cpu_value)) - repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cudart", cpu_value)) - repository_ctx.file( - "cuda/cuda/lib/%s" % lib_name("cudart_static", cpu_value), - ) - repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cublas", cpu_value)) - repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cusolver", cpu_value)) - repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cudnn", cpu_value)) - repository_ctx.file("cuda/cuda/lib/%s" % lib_name("curand", cpu_value)) - repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cufft", cpu_value)) - repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cupti", cpu_value)) - -def _execute( - repository_ctx, - cmdline, - error_msg = None, - error_details = None, - empty_stdout_fine = False): - """Executes an arbitrary shell command. - Args: - repository_ctx: the repository_ctx object - cmdline: list of strings, the command to execute - error_msg: string, a summary of the error if the command fails - error_details: string, details about the error or steps to fix it - empty_stdout_fine: bool, if True, an empty stdout result is fine, - otherwise it's an error - Return: the result of repository_ctx.execute(cmdline) - """ - result = repository_ctx.execute(cmdline) - if result.stderr or not (empty_stdout_fine or result.stdout): - auto_configure_fail( - "\n".join([ - error_msg.strip() if error_msg else "Repository command failed", - result.stderr.strip(), - error_details if error_details else "", - ]), - ) - return result - -def _norm_path(path): - """Returns a path with '/' and remove the trailing slash.""" - path = path.replace("\\", "/") - if path[-1] == "/": - path = path[:-1] - return path - -def make_copy_files_rule(repository_ctx, name, srcs, outs): - """Returns a rule to copy a set of files.""" - cmds = [] - - # Copy files. 
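    # The returned string is a genrule of roughly this shape (paths illustrative):
    #   genrule(
    #       name = "cublas-include",
    #       outs = ["cublas/include/cublas.h", ...],
    #       cmd = """cp -f "/usr/local/cuda/include/cublas.h" $(location cublas/include/cublas.h) && ...""",
    #   )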
- for src, out in zip(srcs, outs): - cmds.append('cp -f "%s" $(location %s)' % (src, out)) - outs = [(' "%s",' % out) for out in outs] - return """genrule( - name = "%s", - outs = [ -%s - ], - cmd = \"""%s \""", -)""" % (name, "\n".join(outs), " && ".join(cmds)) - -def make_copy_dir_rule(repository_ctx, name, src_dir, out_dir): - """Returns a rule to recursively copy a directory.""" - src_dir = _norm_path(src_dir) - out_dir = _norm_path(out_dir) - outs = _read_dir(repository_ctx, src_dir) - outs = [(' "%s",' % out.replace(src_dir, out_dir)) for out in outs] - - # '@D' already contains the relative path for a single file, see - # http://docs.bazel.build/versions/master/be/make-variables.html#predefined_genrule_variables - out_dir = "$(@D)/%s" % out_dir if len(outs) > 1 else "$(@D)" - return """genrule( - name = "%s", - outs = [ -%s - ], - cmd = \"""cp -rLf "%s/." "%s/" \""", -)""" % (name, "\n".join(outs), src_dir, out_dir) - -def _read_dir(repository_ctx, src_dir): - """Returns a string with all files in a directory. - Finds all files inside a directory, traversing subfolders and following - symlinks. The returned string contains the full path of all files - separated by line breaks. - """ - if _is_windows(repository_ctx): - src_dir = src_dir.replace("/", "\\") - find_result = _execute( - repository_ctx, - ["cmd.exe", "/c", "dir", src_dir, "/b", "/s", "/a-d"], - empty_stdout_fine = True, - ) - - # src_files will be used in genrule.outs where the paths must - # use forward slashes. - result = find_result.stdout.replace("\\", "/") - else: - find_result = _execute( - repository_ctx, - ["find", src_dir, "-follow", "-type", "f"], - empty_stdout_fine = True, - ) - result = find_result.stdout - return sorted(result.splitlines()) - -def _create_local_cuda_repository(repository_ctx): - """Creates the repository containing files set up to build with CUDA.""" - cuda_config = _get_cuda_config(repository_ctx) - - cuda_include_path = cuda_config.config["cuda_include_dir"] - cublas_include_path = cuda_config.config["cublas_include_dir"] - cudnn_header_dir = cuda_config.config["cudnn_include_dir"] - cupti_header_dir = cuda_config.config["cupti_include_dir"] - nvvm_libdevice_dir = cuda_config.config["nvvm_library_dir"] - - # Create genrule to copy files from the installed CUDA toolkit into execroot. 
- copy_rules = [ - make_copy_dir_rule( - repository_ctx, - name = "cuda-include", - src_dir = cuda_include_path, - out_dir = "cuda/include", - ), - make_copy_dir_rule( - repository_ctx, - name = "cuda-nvvm", - src_dir = nvvm_libdevice_dir, - out_dir = "cuda/nvvm/libdevice", - ), - make_copy_dir_rule( - repository_ctx, - name = "cuda-extras", - src_dir = cupti_header_dir, - out_dir = "cuda/extras/CUPTI/include", - ), - ] - - copy_rules.append(make_copy_files_rule( - repository_ctx, - name = "cublas-include", - srcs = [ - cublas_include_path + "/cublas.h", - cublas_include_path + "/cublas_v2.h", - cublas_include_path + "/cublas_api.h", - ], - outs = [ - "cublas/include/cublas.h", - "cublas/include/cublas_v2.h", - "cublas/include/cublas_api.h", - ], - )) - - cuda_libs = _find_libs(repository_ctx, cuda_config) - cuda_lib_srcs = [] - cuda_lib_outs = [] - for path in cuda_libs.values(): - cuda_lib_srcs.append(str(path)) - cuda_lib_outs.append("cuda/lib/" + path.basename) - copy_rules.append(make_copy_files_rule( - repository_ctx, - name = "cuda-lib", - srcs = cuda_lib_srcs, - outs = cuda_lib_outs, - )) - - copy_rules.append(make_copy_dir_rule( - repository_ctx, - name = "cuda-bin", - src_dir = cuda_config.cuda_toolkit_path + "/bin", - out_dir = "cuda/bin", - )) - - # Copy cudnn.h if cuDNN was not installed to CUDA_TOOLKIT_PATH. - included_files = _read_dir(repository_ctx, cuda_include_path) - if not any([file.endswith("cudnn.h") for file in included_files]): - if [int(x) for x in cuda_config.cudnn_version.split(".")] < [8, 0]: - cudnn_headers = ["cudnn.h"] - else: - cudnn_headers = [ - "cudnn_adv_infer.h", - "cudnn_adv_train.h", - "cudnn_cnn_infer.h", - "cudnn_cnn_train.h", - "cudnn_ops_infer.h", - "cudnn_ops_train.h", - "cudnn.h", - "cudnn_version.h", - ] - cudnn_srcs = [] - cudnn_outs = [] - for header in cudnn_headers: - cudnn_srcs.append(cudnn_header_dir + "/" + header) - cudnn_outs.append("cudnn/include/" + header) - - copy_rules.append(make_copy_files_rule( - repository_ctx, - name = "cudnn-include", - srcs = cudnn_srcs, - outs = cudnn_outs, - )) - else: - copy_rules.append("filegroup(name = 'cudnn-include')\n") - - # Set up BUILD file for cuda/ - _tpl( - repository_ctx, - "cuda:build_defs.bzl", - { - "%{cuda_is_configured}": "True", - "%{cuda_extra_copts}": "[]", - }, - ) - - _tpl( - repository_ctx, - "cuda:BUILD", - { - "%{cuda_driver_lib}": cuda_libs["cuda"].basename, - "%{cudart_static_lib}": cuda_libs["cudart_static"].basename, - "%{cudart_static_linkopt}": _cudart_static_linkopt(cuda_config.cpu_value), - "%{cudart_lib}": cuda_libs["cudart"].basename, - "%{cublas_lib}": cuda_libs["cublas"].basename, - "%{cusolver_lib}": cuda_libs["cusolver"].basename, - "%{cudnn_lib}": cuda_libs["cudnn"].basename, - "%{cufft_lib}": cuda_libs["cufft"].basename, - "%{curand_lib}": cuda_libs["curand"].basename, - "%{cupti_lib}": cuda_libs["cupti"].basename, - "%{copy_rules}": "\n".join(copy_rules), - "%{cuda_headers}": ( - '":cuda-include",\n' + ' ":cudnn-include",' - ), - }, - "cuda/BUILD", - ) - - # Set up crosstool/ - cc = find_cc(repository_ctx) - cc_fullpath = cc - - host_compiler_includes = _host_compiler_includes(repository_ctx, cc_fullpath) - - cuda_defines = {} - - # Bazel sets '-B/usr/bin' flag to workaround build errors on RHEL (see - # https://github.com/bazelbuild/bazel/issues/760). - # However, this stops our custom clang toolchain from picking the provided - # LLD linker, so we're only adding '-B/usr/bin' when using non-downloaded - # toolchain. 
- # TODO: when bazel stops adding '-B/usr/bin' by default, remove this - # flag from the CROSSTOOL completely (see - # https://github.com/bazelbuild/bazel/issues/5634) - cuda_defines["%{linker_bin_path_flag}"] = 'flag: "-B/usr/bin"' - - cuda_defines["%{host_compiler_path}"] = "clang/bin/crosstool_wrapper_driver_is_not_gcc" - cuda_defines["%{host_compiler_warnings}"] = "" - - # nvcc has the system include paths built in and will automatically - # search them; we cannot work around that, so we add the relevant cuda - # system paths to the allowed compiler specific include paths. - cuda_defines["%{host_compiler_includes}"] = ( - host_compiler_includes + "\n" + _cuda_include_path( - repository_ctx, - cuda_config, - ) + - "\n cxx_builtin_include_directory: \"%s\"" % cupti_header_dir + - "\n cxx_builtin_include_directory: \"%s\"" % cudnn_header_dir - ) - - # For gcc, do not canonicalize system header paths; some versions of gcc - # pick the shortest possible path for system includes when creating the - # .d file - given that includes that are prefixed with "../" multiple - # time quickly grow longer than the root of the tree, this can lead to - # bazel's header check failing. - cuda_defines["%{extra_no_canonical_prefixes_flags}"] = ( - "flag: \"-fno-canonical-system-headers\"" - ) - nvcc_path = str( - repository_ctx.path("%s/bin/nvcc%s" % ( - cuda_config.cuda_toolkit_path, - ".exe" if _is_windows(repository_ctx) else "", - )), - ) - - builtin_include_directories = [] - for one_line in cuda_defines["%{host_compiler_includes}"].splitlines(): - inc_dir = one_line.split(":")[1][2:-1] - builtin_include_directories.append(inc_dir) - - _tpl( - repository_ctx, - "crosstool:BUILD", - { - "%{linker_files}": ":crosstool_wrapper_driver_is_not_gcc", - "%{cxx_builtin_include_directories}": ",".join(builtin_include_directories), - "%{win_linker_files}": ":windows_msvc_wrapper_files", - }, - ) - wrapper_defines = { - "%{cpu_compiler}": str(cc), - "%{cuda_version}": cuda_config.cuda_version, - "%{nvcc_path}": nvcc_path, - "%{gcc_host_compiler_path}": str(cc), - "%{cuda_compute_capabilities}": ", ".join( - ["\"%s\"" % c for c in cuda_config.compute_capabilities], - ), - "%{nvcc_tmp_dir}": _get_nvcc_tmp_dir_for_windows(repository_ctx), - } - - _tpl( - repository_ctx, - "crosstool:cc_toolchain_config.bzl", - wrapper_defines, - ) - _tpl( - repository_ctx, - "crosstool:clang/bin/crosstool_wrapper_driver_is_not_gcc", - wrapper_defines, - ) - - _tpl( - repository_ctx, - "crosstool:windows/msvc_wrapper_for_nvcc.py", - wrapper_defines, - ) - - _tpl( - repository_ctx, - "crosstool:CROSSTOOL", - cuda_defines.update(_get_win_cuda_defines(repository_ctx)), - out = "crosstool/CROSSTOOL", - ) - -def find_cuda_config(repository_ctx, cuda_libraries): - """Returns CUDA config dictionary from running find_cuda_config.py""" - exec_result = repository_ctx.execute([ - _get_python_bin(repository_ctx), - repository_ctx.path(Label("//build_deps/toolchains/gpu:find_cuda_config.py")), - ] + cuda_libraries) - if exec_result.return_code: - auto_configure_fail("Failed to run find_cuda_config.py: %s" % exec_result.stderr) - - # Parse the dict from stdout. 
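    # find_cuda_config.py emits one "key: value" pair per line, e.g.
    # "cuda_version: 11.2" or "cuda_include_dir: /usr/local/cuda/include";
    # each line is split on ": " below and the pairs are collected into a dict.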
- return dict([tuple(x.split(": ")) for x in exec_result.stdout.splitlines()]) - -def _cuda_autoconf_impl(repository_ctx): - """Implementation of the cuda_autoconf repository rule.""" - if not enable_cuda(repository_ctx): - _create_dummy_repository(repository_ctx) - else: - _create_local_cuda_repository(repository_ctx) - -cuda_configure = repository_rule( - environ = [ - _GCC_HOST_COMPILER_PATH, - _CLANG_CUDA_COMPILER_PATH, - "TF_NEED_CUDA", - "TF_CUDA_CLANG", - _TF_DOWNLOAD_CLANG, - _CUDA_TOOLKIT_PATH, - _CUDNN_INSTALL_PATH, - _TF_CUDA_VERSION, - _TF_CUDNN_VERSION, - _TF_CUDA_COMPUTE_CAPABILITIES, - "NVVMIR_LIBRARY_DIR", - _PYTHON_BIN_PATH, - ], - implementation = _cuda_autoconf_impl, -) - -"""Detects and configures the local CUDA toolchain. -Add the following to your WORKSPACE FILE: -```python -cuda_configure(name = "local_config_cuda") -``` -Args: - name: A unique name for this workspace rule. -""" diff --git a/build_deps/toolchains/gpu/find_cuda_config.py b/build_deps/toolchains/gpu/find_cuda_config.py deleted file mode 100644 index 679de5ea..00000000 --- a/build_deps/toolchains/gpu/find_cuda_config.py +++ /dev/null @@ -1,632 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Prints CUDA library and header directories and versions found on the system. - -The script searches for CUDA library and header files on the system, inspects -them to determine their version and prints the configuration to stdout. -The paths to inspect and the required versions are specified through environment -variables. If no valid configuration is found, the script prints to stderr and -returns an error code. - -The list of libraries to find is specified as arguments. Supported libraries are -CUDA (includes cuBLAS), cuDNN, NCCL, and TensorRT. - -The script takes a list of base directories specified by the TF_CUDA_PATHS -environment variable as comma-separated glob list. The script looks for headers -and library files in a hard-coded set of subdirectories from these base paths. -If TF_CUDA_PATHS is not specified, a OS specific default is used: - - Linux: /usr/local/cuda, /usr, and paths from 'ldconfig -p'. - Windows: CUDA_PATH environment variable, or - C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\* - -For backwards compatibility, some libraries also use alternative base -directories from other environment variables if they are specified. 
List of -library-specific environment variables: - - Library Version env variable Additional base directories - ---------------------------------------------------------------- - CUDA TF_CUDA_VERSION CUDA_TOOLKIT_PATH - cuBLAS TF_CUBLAS_VERSION CUDA_TOOLKIT_PATH - cuDNN TF_CUDNN_VERSION CUDNN_INSTALL_PATH - NCCL TF_NCCL_VERSION NCCL_INSTALL_PATH, NCCL_HDR_PATH - TensorRT TF_TENSORRT_VERSION TENSORRT_INSTALL_PATH - -Versions environment variables can be of the form 'x' or 'x.y' to request a -specific version, empty or unspecified to accept any version. - -The output of a found library is of the form: -tf__version: x.y.z -tf__header_dir: ... -tf__library_dir: ... -""" - -import io -import os -import glob -import platform -import re -import subprocess -import sys - -# pylint: disable=g-import-not-at-top -try: - from shutil import which -except ImportError: - from distutils.spawn import find_executable as which -# pylint: enable=g-import-not-at-top - - -class ConfigError(Exception): - pass - - -def _is_linux(): - return platform.system() == "Linux" - - -def _is_windows(): - return platform.system() == "Windows" - - -def _is_macos(): - return platform.system() == "Darwin" - - -def _matches_version(actual_version, required_version): - """Checks whether some version meets the requirements. - - All elements of the required_version need to be present in the - actual_version. - - required_version actual_version result - ----------------------------------------- - 1 1.1 True - 1.2 1 False - 1.2 1.3 False - 1 True - - Args: - required_version: The version specified by the user. - actual_version: The version detected from the CUDA installation. - Returns: Whether the actual version matches the required one. - """ - if actual_version is None: - return False - - # Strip spaces from the versions. - actual_version = actual_version.strip() - required_version = required_version.strip() - return actual_version.startswith(required_version) - - -def _at_least_version(actual_version, required_version): - actual = [int(v) for v in actual_version.split(".")] - required = [int(v) for v in required_version.split(".")] - return actual >= required - - -def _get_header_version(path, name): - """Returns preprocessor defines in C header file.""" - for line in io.open(path, "r", encoding="utf-8").readlines(): - match = re.match(r"#define %s +(\d+)" % name, line) - if match: - return match.group(1) - return "" - - -def _cartesian_product(first, second): - """Returns all path combinations of first and second.""" - return [os.path.join(f, s) for f in first for s in second] - - -def _get_ld_config_paths(): - """Returns all directories from 'ldconfig -p'.""" - if not _is_linux(): - return [] - ldconfig_path = which("ldconfig") or "/sbin/ldconfig" - output = subprocess.check_output([ldconfig_path, "-p"]) - pattern = re.compile(".* => (.*)") - result = set() - for line in output.splitlines(): - try: - match = pattern.match(line.decode("ascii")) - except UnicodeDecodeError: - match = False - if match: - result.add(os.path.dirname(match.group(1))) - return sorted(list(result)) - - -def _get_default_cuda_paths(cuda_version): - if not cuda_version: - cuda_version = "*" - elif "." 
not in cuda_version: - cuda_version = cuda_version + ".*" - - if _is_windows(): - return [ - os.environ.get( - "CUDA_PATH", - "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v%s\\" % cuda_version, - ) - ] - return [ - "/usr/local/cuda-%s" % cuda_version, - "/usr/local/cuda", - "/usr", - "/usr/local/cudnn", - ] + _get_ld_config_paths() - - -def _header_paths(): - """Returns hard-coded set of relative paths to look for header files.""" - return [ - "", - "include", - "include/cuda", - "include/*-linux-gnu", - "extras/CUPTI/include", - "include/cuda/CUPTI", - "local/cuda/extras/CUPTI/include", - ] - - -def _library_paths(): - """Returns hard-coded set of relative paths to look for library files.""" - return [ - "", - "lib64", - "lib", - "lib/*-linux-gnu", - "lib/x64", - "extras/CUPTI/*", - "local/cuda/lib64", - "local/cuda/extras/CUPTI/lib64", - ] - - -def _not_found_error(base_paths, relative_paths, filepattern): - base_paths = "".join(["\n '%s'" % path for path in sorted(base_paths)]) - relative_paths = "".join(["\n '%s'" % path for path in relative_paths]) - return ConfigError( - "Could not find any %s in any subdirectory:%s\nof:%s\n" % (filepattern, relative_paths, base_paths) - ) - - -def _find_file(base_paths, relative_paths, filepattern): - for path in _cartesian_product(base_paths, relative_paths): - for file in glob.glob(os.path.join(path, filepattern)): - return file - raise _not_found_error(base_paths, relative_paths, filepattern) - - -def _find_library(base_paths, library_name, required_version): - """Returns first valid path to the requested library.""" - if _is_windows(): - filepattern = library_name + ".lib" - elif _is_macos(): - filepattern = "%s*.dylib" % (".".join(["lib" + library_name] + required_version.split(".")[:1])) - else: - filepattern = (".".join(["lib" + library_name, "so"] + required_version.split(".")[:1]) + "*") - return _find_file(base_paths, _library_paths(), filepattern) - - -def _find_versioned_file(base_paths, relative_paths, filepatterns, required_version, get_version): - """Returns first valid path to a file that matches the requested version.""" - if type(filepatterns) not in [list, tuple]: - filepatterns = [filepatterns] - for path in _cartesian_product(base_paths, relative_paths): - for filepattern in filepatterns: - for file in glob.glob(os.path.join(path, filepattern)): - actual_version = get_version(file) - if _matches_version(actual_version, required_version): - return file, actual_version - raise _not_found_error( - base_paths, - relative_paths, - ", ".join(filepatterns) + " matching version '%s'" % required_version, - ) - - -def _find_header(base_paths, header_name, required_version, get_version): - """Returns first valid path to a header that matches the requested version.""" - return _find_versioned_file(base_paths, _header_paths(), header_name, required_version, get_version) - - -def _find_cuda_config(base_paths, required_version): - - def get_header_version(path): - version = int(_get_header_version(path, "CUDA_VERSION")) - if not version: - return None - return "%d.%d" % (version // 1000, version % 1000 // 10) - - cuda_header_path, header_version = _find_header(base_paths, "cuda.h", required_version, get_header_version) - cuda_version = header_version # x.y, see above. 
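  # For example, a CUDA 11.2 toolkit defines CUDA_VERSION as 11020 in cuda.h,
  # which get_header_version above renders as the string "11.2".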
- - cuda_library_path = _find_library(base_paths, "cudart", cuda_version) - - def get_nvcc_version(path): - pattern = r"Cuda compilation tools, release \d+\.\d+, V(\d+\.\d+\.\d+)" - for line in subprocess.check_output([path, "--version"]).splitlines(): - match = re.match(pattern, line.decode("ascii")) - if match: - return match.group(1) - return None - - nvcc_name = "nvcc.exe" if _is_windows() else "nvcc" - nvcc_path, nvcc_version = _find_versioned_file( - base_paths, - [ - "", - "bin", - "local/cuda/bin", - ], - nvcc_name, - cuda_version, - get_nvcc_version, - ) - - nvvm_path = _find_file( - base_paths, - [ - "nvvm/libdevice", - "share/cuda", - "lib/nvidia-cuda-toolkit/libdevice", - "local/cuda/nvvm/libdevice", - ], - "libdevice*.10.bc", - ) - - cupti_header_path = _find_file(base_paths, _header_paths(), "cupti.h") - cupti_library_path = _find_library(base_paths, "cupti", required_version) - - cuda_binary_dir = os.path.dirname(nvcc_path) - nvvm_library_dir = os.path.dirname(nvvm_path) - - # XLA requires the toolkit path to find ptxas and libdevice. - # TODO(csigg): pass in both directories instead. - cuda_toolkit_paths = ( - os.path.normpath(os.path.join(cuda_binary_dir, "..")), - os.path.normpath(os.path.join(nvvm_library_dir, "../..")), - ) - if cuda_toolkit_paths[0] != cuda_toolkit_paths[1]: - raise ConfigError("Inconsistent CUDA toolkit path: %s vs %s" % cuda_toolkit_paths) - - return { - "cuda_version": cuda_version, - "cuda_include_dir": os.path.dirname(cuda_header_path), - "cuda_library_dir": os.path.dirname(cuda_library_path), - "cuda_binary_dir": cuda_binary_dir, - "nvvm_library_dir": nvvm_library_dir, - "cupti_include_dir": os.path.dirname(cupti_header_path), - "cupti_library_dir": os.path.dirname(cupti_library_path), - "cuda_toolkit_path": cuda_toolkit_paths[0], - } - - -def _find_cublas_config(base_paths, required_version, cuda_version): - if _at_least_version(cuda_version, "10.1"): - - def get_header_version(path): - version = ( - _get_header_version(path, name) for name in ("CUBLAS_VER_MAJOR", "CUBLAS_VER_MINOR", "CUBLAS_VER_PATCH") - ) - return ".".join(version) - - header_path, header_version = _find_header(base_paths, "cublas_api.h", required_version, get_header_version) - # cuBLAS uses the major version only. - cublas_version = header_version.split(".")[0] - - else: - # There is no version info available before CUDA 10.1, just find the file. - header_version = cuda_version - header_path = _find_file(base_paths, _header_paths(), "cublas_api.h") - # cuBLAS version is the same as CUDA version (x.y). 
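Hedged illustration, not part of the patch: the get_nvcc_version() helper above scans the `nvcc --version` banner line by line with a regular expression. The sample banner text below is an assumed example of that output, not captured from a real toolkit:

import re

sample_line = "Cuda compilation tools, release 11.8, V11.8.89"
pattern = r"Cuda compilation tools, release \d+\.\d+, V(\d+\.\d+\.\d+)"
match = re.match(pattern, sample_line)
assert match is not None and match.group(1) == "11.8.89"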
- cublas_version = required_version - - library_path = _find_library(base_paths, "cublas", cublas_version) - - return { - "cublas_version": header_version, - "cublas_include_dir": os.path.dirname(header_path), - "cublas_library_dir": os.path.dirname(library_path), - } - - -def _find_cusolver_config(base_paths, required_version, cuda_version): - if _at_least_version(cuda_version, "11.0"): - - def get_header_version(path): - version = ( - _get_header_version(path, name) for name in ( - "CUSOLVER_VER_MAJOR", - "CUSOLVER_VER_MINOR", - "CUSOLVER_VER_PATCH", - ) - ) - return ".".join(version) - - header_path, header_version = _find_header(base_paths, "cusolver_common.h", required_version, get_header_version) - cusolver_version = header_version.split(".")[0] - - else: - header_version = cuda_version - header_path = _find_file(base_paths, _header_paths(), "cusolver_common.h") - cusolver_version = required_version - - library_path = _find_library(base_paths, "cusolver", cusolver_version) - - return { - "cusolver_version": header_version, - "cusolver_include_dir": os.path.dirname(header_path), - "cusolver_library_dir": os.path.dirname(library_path), - } - - -def _find_curand_config(base_paths, required_version, cuda_version): - if _at_least_version(cuda_version, "11.0"): - - def get_header_version(path): - version = ( - _get_header_version(path, name) for name in ("CURAND_VER_MAJOR", "CURAND_VER_MINOR", "CURAND_VER_PATCH") - ) - return ".".join(version) - - header_path, header_version = _find_header(base_paths, "curand.h", required_version, get_header_version) - curand_version = header_version.split(".")[0] - - else: - header_version = cuda_version - header_path = _find_file(base_paths, _header_paths(), "curand.h") - curand_version = required_version - - library_path = _find_library(base_paths, "curand", curand_version) - - return { - "curand_version": header_version, - "curand_include_dir": os.path.dirname(header_path), - "curand_library_dir": os.path.dirname(library_path), - } - - -def _find_cufft_config(base_paths, required_version, cuda_version): - if _at_least_version(cuda_version, "11.0"): - - def get_header_version(path): - version = (_get_header_version(path, name) for name in ("CUFFT_VER_MAJOR", "CUFFT_VER_MINOR", "CUFFT_VER_PATCH")) - return ".".join(version) - - header_path, header_version = _find_header(base_paths, "cufft.h", required_version, get_header_version) - cufft_version = header_version.split(".")[0] - - else: - header_version = cuda_version - header_path = _find_file(base_paths, _header_paths(), "cufft.h") - cufft_version = required_version - - library_path = _find_library(base_paths, "cufft", cufft_version) - - return { - "cufft_version": header_version, - "cufft_include_dir": os.path.dirname(header_path), - "cufft_library_dir": os.path.dirname(library_path), - } - - -def _find_cudnn_config(base_paths, required_version): - - def get_header_version(path): - version = [_get_header_version(path, name) for name in ("CUDNN_MAJOR", "CUDNN_MINOR", "CUDNN_PATCHLEVEL")] - return ".".join(version) if version[0] else None - - header_path, header_version = _find_header( - base_paths, ("cudnn.h", "cudnn_version.h"), required_version, get_header_version - ) - cudnn_version = header_version.split(".")[0] - - library_path = _find_library(base_paths, "cudnn", cudnn_version) - - return { - "cudnn_version": cudnn_version, - "cudnn_include_dir": os.path.dirname(header_path), - "cudnn_library_dir": os.path.dirname(library_path), - } - - -def _find_cusparse_config(base_paths, required_version, 
cuda_version): - if _at_least_version(cuda_version, "11.0"): - - def get_header_version(path): - version = ( - _get_header_version(path, name) for name in ( - "CUSPARSE_VER_MAJOR", - "CUSPARSE_VER_MINOR", - "CUSPARSE_VER_PATCH", - ) - ) - return ".".join(version) - - header_path, header_version = _find_header(base_paths, "cusparse.h", required_version, get_header_version) - cusparse_version = header_version.split(".")[0] - - else: - header_version = cuda_version - header_path = _find_file(base_paths, _header_paths(), "cusparse.h") - cusparse_version = required_version - - library_path = _find_library(base_paths, "cusparse", cusparse_version) - - return { - "cusparse_version": header_version, - "cusparse_include_dir": os.path.dirname(header_path), - "cusparse_library_dir": os.path.dirname(library_path), - } - - -def _find_nccl_config(base_paths, required_version): - - def get_header_version(path): - version = (_get_header_version(path, name) for name in ("NCCL_MAJOR", "NCCL_MINOR", "NCCL_PATCH")) - return ".".join(version) - - header_path, header_version = _find_header(base_paths, "nccl.h", required_version, get_header_version) - nccl_version = header_version.split(".")[0] - - library_path = _find_library(base_paths, "nccl", nccl_version) - - return { - "nccl_version": nccl_version, - "nccl_include_dir": os.path.dirname(header_path), - "nccl_library_dir": os.path.dirname(library_path), - } - - -def _find_tensorrt_config(base_paths, required_version): - - def get_header_version(path): - version = ( - _get_header_version(path, name) for name in ("NV_TENSORRT_MAJOR", "NV_TENSORRT_MINOR", "NV_TENSORRT_PATCH") - ) - # `version` is a generator object, so we convert it to a list before using - # it (muitiple times below). - version = list(version) - if not all(version): - return None # Versions not found, make _matches_version returns False. - return ".".join(version) - - try: - header_path, header_version = _find_header(base_paths, "NvInfer.h", required_version, get_header_version) - except ConfigError: - # TensorRT 6 moved the version information to NvInferVersion.h. - header_path, header_version = _find_header(base_paths, "NvInferVersion.h", required_version, get_header_version) - - tensorrt_version = header_version.split(".")[0] - library_path = _find_library(base_paths, "nvinfer", tensorrt_version) - - return { - "tensorrt_version": tensorrt_version, - "tensorrt_include_dir": os.path.dirname(header_path), - "tensorrt_library_dir": os.path.dirname(library_path), - } - - -def _list_from_env(env_name, default=[]): - """Returns comma-separated list from environment variable.""" - if env_name in os.environ: - return os.environ[env_name].split(",") - return default - - -def _get_legacy_path(env_name, default=[]): - """Returns a path specified by a legacy environment variable. - - CUDNN_INSTALL_PATH, NCCL_INSTALL_PATH, TENSORRT_INSTALL_PATH set to - '/usr/lib/x86_64-linux-gnu' would previously find both library and header - paths. Detect those and return '/usr', otherwise forward to _list_from_env(). 
- """ - if env_name in os.environ: - match = re.match(r"^(/[^/ ]*)+/lib/\w+-linux-gnu/?$", os.environ[env_name]) - if match: - return [match.group(1)] - return _list_from_env(env_name, default) - - -def _normalize_path(path): - """Returns normalized path, with forward slashes on Windows.""" - path = os.path.realpath(path) - if _is_windows(): - path = path.replace("\\", "/") - return path - - -def find_cuda_config(): - """Returns a dictionary of CUDA library and header file paths.""" - libraries = [argv.lower() for argv in sys.argv[1:]] - cuda_version = os.environ.get("TF_CUDA_VERSION", "") - base_paths = _list_from_env("TF_CUDA_PATHS", _get_default_cuda_paths(cuda_version)) - base_paths = [path for path in base_paths if os.path.exists(path)] - - result = {} - if "cuda" in libraries: - cuda_paths = _list_from_env("CUDA_TOOLKIT_PATH", base_paths) - result.update(_find_cuda_config(cuda_paths, cuda_version)) - - cuda_version = result["cuda_version"] - cublas_paths = base_paths - if tuple(int(v) for v in cuda_version.split(".")) < (10, 1): - # Before CUDA 10.1, cuBLAS was in the same directory as the toolkit. - cublas_paths = cuda_paths - cublas_version = os.environ.get("TF_CUBLAS_VERSION", "") - result.update(_find_cublas_config(cublas_paths, cublas_version, cuda_version)) - - cusolver_paths = base_paths - if tuple(int(v) for v in cuda_version.split(".")) < (11, 0): - cusolver_paths = cuda_paths - cusolver_version = os.environ.get("TF_CUSOLVER_VERSION", "") - result.update(_find_cusolver_config(cusolver_paths, cusolver_version, cuda_version)) - - curand_paths = base_paths - if tuple(int(v) for v in cuda_version.split(".")) < (11, 0): - curand_paths = cuda_paths - curand_version = os.environ.get("TF_CURAND_VERSION", "") - result.update(_find_curand_config(curand_paths, curand_version, cuda_version)) - - cufft_paths = base_paths - if tuple(int(v) for v in cuda_version.split(".")) < (11, 0): - cufft_paths = cuda_paths - cufft_version = os.environ.get("TF_CUFFT_VERSION", "") - result.update(_find_cufft_config(cufft_paths, cufft_version, cuda_version)) - - cusparse_paths = base_paths - if tuple(int(v) for v in cuda_version.split(".")) < (11, 0): - cusparse_paths = cuda_paths - cusparse_version = os.environ.get("TF_CUSPARSE_VERSION", "") - result.update(_find_cusparse_config(cusparse_paths, cusparse_version, cuda_version)) - - if "cudnn" in libraries: - cudnn_paths = _get_legacy_path("CUDNN_INSTALL_PATH", base_paths) - cudnn_version = os.environ.get("TF_CUDNN_VERSION", "") - result.update(_find_cudnn_config(cudnn_paths, cudnn_version)) - - if "nccl" in libraries: - nccl_paths = _get_legacy_path("NCCL_INSTALL_PATH", base_paths) - nccl_version = os.environ.get("TF_NCCL_VERSION", "") - result.update(_find_nccl_config(nccl_paths, nccl_version)) - - if "tensorrt" in libraries: - tensorrt_paths = _get_legacy_path("TENSORRT_INSTALL_PATH", base_paths) - tensorrt_version = os.environ.get("TF_TENSORRT_VERSION", "") - result.update(_find_tensorrt_config(tensorrt_paths, tensorrt_version)) - - for k, v in result.items(): - if k.endswith("_dir") or k.endswith("_path"): - result[k] = _normalize_path(v) - - return result - - -def main(): - try: - for key, value in sorted(find_cuda_config().items()): - print("%s: %s" % (key, value)) - except ConfigError as e: - sys.stderr.write(str(e)) - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/configure.py b/configure.py index 60fb41cf..e2086460 100644 --- a/configure.py +++ b/configure.py @@ -17,12 +17,15 @@ import argparse import errno import glob +import 
logging import os import pathlib import platform import re +import shutil import subprocess import sys +from typing import Optional import tensorflow as tf from packaging.version import Version @@ -37,9 +40,6 @@ _DEFAULT_CUDA_VERSION = '11' _DEFAULT_CUDNN_VERSION = '2' _DEFAULT_TENSORRT_VERSION = '6' -_DEFAULT_CUDA_COMPUTE_CAPABILITIES = '7.0,7.5,8.0,8.6' - -_SUPPORTED_ANDROID_NDK_VERSIONS = [19, 20, 21, 25] _DEFAULT_PROMPT_ASK_ATTEMPTS = 10 @@ -48,20 +48,6 @@ _DP_BAZELRC = '' _DP_CURRENT_BAZEL_VERSION = None -NCCL_LIB_PATHS = ['lib64/', 'lib/powerpc64le-linux-gnu/', 'lib/x86_64-linux-gnu/', ''] - -# List of files to configure when building Bazel on Apple platforms. -APPLE_BAZEL_FILES = [ - 'tensorflow/lite/ios/BUILD', 'tensorflow/lite/objc/BUILD', 'tensorflow/lite/swift/BUILD', - 'tensorflow/lite/tools/benchmark/experimental/ios/BUILD' -] - -# List of files to move when building for iOS. -IOS_FILES = [ - 'tensorflow/lite/objc/TensorFlowLiteObjC.podspec', - 'tensorflow/lite/swift/TensorFlowLiteSwift.podspec', -] - class UserInputError(Exception): pass @@ -104,6 +90,45 @@ def get_tf_header_dir(): return tf_header_dir +def get_cpp_version(): + cpp_version = "c++14" + if Version(tf.__version__) >= Version("2.10"): + cpp_version = "c++17" + return cpp_version + + +def get_tf_shared_lib_dir(): + import tensorflow as tf + + # OS Specific parsing + if is_windows(): + tf_shared_lib_dir = tf.sysconfig.get_compile_flags()[0][2:-7] + "python" + return tf_shared_lib_dir.replace("\\", "/") + elif is_raspi_arm(): + return tf.sysconfig.get_compile_flags()[0][2:-7] + "python" + else: + return tf.sysconfig.get_link_flags()[0][2:] + + +# Converts the linkflag namespec to the full shared library name +def get_shared_lib_name(): + import tensorflow as tf + + namespec = tf.sysconfig.get_link_flags() + if is_macos(): + # MacOS + return "lib" + namespec[1][2:] + ".dylib" + elif is_windows(): + # Windows + return "_pywrap_tensorflow_internal.lib" + elif is_raspi_arm(): + # The below command for linux would return an empty list + return "_pywrap_tensorflow_internal.so" + else: + # Linux + return namespec[1][3:] + + def get_tf_version_integer(): """ Get Tensorflow version as a 4 digits string. @@ -115,7 +140,7 @@ def get_tf_version_integer(): 2.8.3 get 2083 The 4-digits-string will be passed to C macro to discriminate different - Tensorflow versions. + Tensorflow versions. We assume that major version has 1 digit, minor version has 2 digits. And patch version has 1 digit. 
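Illustrative sketch, not part of the patch, of the digit layout this docstring describes: the major version keeps one digit, the minor version is widened to two digits, and the patch version keeps one digit, so TensorFlow 2.8.3 maps to 2083. The real helper derives the number from tf.__version__; the hypothetical function below only demonstrates the arithmetic:

def tf_version_integer(version: str) -> int:
  major, minor, patch = (int(part) for part in version.split(".")[:3])
  return major * 1000 + minor * 10 + patch

assert tf_version_integer("2.8.3") == 2083
assert tf_version_integer("2.15.0") == 2150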
@@ -146,45 +171,6 @@ def get_tf_version_integer(): return int(tf_version_num) -def get_cpp_version(): - cpp_version = "c++14" - if Version(tf.__version__) >= Version("2.10"): - cpp_version = "c++17" - return cpp_version - - -def get_tf_shared_lib_dir(): - import tensorflow as tf - - # OS Specific parsing - if is_windows(): - tf_shared_lib_dir = tf.sysconfig.get_compile_flags()[0][2:-7] + "python" - return tf_shared_lib_dir.replace("\\", "/") - elif is_raspi_arm(): - return tf.sysconfig.get_compile_flags()[0][2:-7] + "python" - else: - return tf.sysconfig.get_link_flags()[0][2:] - - -# Converts the linkflag namespec to the full shared library name -def get_shared_lib_name(): - import tensorflow as tf - - namespec = tf.sysconfig.get_link_flags() - if is_macos(): - # MacOS - return "lib" + namespec[1][2:] + ".dylib" - elif is_windows(): - # Windows - return "_pywrap_tensorflow_internal.lib" - elif is_raspi_arm(): - # The below command for linux would return an empty list - return "_pywrap_tensorflow_internal.so" - else: - # Linux - return namespec[1][3:] - - def get_input(question): try: try: @@ -222,6 +208,10 @@ def write_action_env(var_name, var): write_to_bazelrc('build --action_env {}="{}"'.format(var_name, str(var))) +def write_repo_env(var_name, var): + write_to_bazelrc('build --repo_env {}="{}"'.format(var_name, str(var))) + + def run_shell(cmd, allow_non_zero=False, stderr=None): if stderr is None: stderr = sys.stdout @@ -315,6 +305,8 @@ def setup_python(environ_cp): python_major_version = get_python_major_version(python_bin_path) if python_major_version == '2': write_to_bazelrc('build --host_force_python=PY2') + logging.debug(f"Hermetic Python version: {sys.version_info.major}.{sys.version_info.minor}") + write_repo_env("HERMETIC_PYTHON_VERSION", f"{sys.version_info.major}.{sys.version_info.minor}") # Convert python path to Windows style before writing into bazel.rc if is_windows() or is_cygwin(): @@ -553,44 +545,6 @@ def set_cc_opt_flags(environ_cp): write_to_bazelrc('build:opt --host_copt=%s' % opt) -def set_tf_cuda_clang(environ_cp): - """set TF_CUDA_CLANG action_env. - - Args: - environ_cp: copy of the os.environ. - """ - question = 'Do you want to use clang as CUDA compiler?' - yes_reply = 'Clang will be used as CUDA compiler.' - no_reply = 'nvcc will be used as CUDA compiler.' - set_action_env_var( - environ_cp, - 'TF_CUDA_CLANG', - None, - False, - question=question, - yes_reply=yes_reply, - no_reply=no_reply, - bazel_config_name='cuda_clang', - ) - - -def set_tf_download_clang(environ_cp): - """Set TF_DOWNLOAD_CLANG action_env.""" - question = 'Do you wish to download a fresh release of clang? (Experimental)' - yes_reply = 'Clang will be downloaded and used to compile tensorflow.' - no_reply = 'Clang will not be downloaded.' - set_action_env_var( - environ_cp, - 'TF_DOWNLOAD_CLANG', - None, - False, - question=question, - yes_reply=yes_reply, - no_reply=no_reply, - bazel_config_name='download_clang' - ) - - def get_from_env_or_user_or_default(environ_cp, var_name, ask_for_var, var_default): """Get var_name either from env, or user or default. 
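Hedged sketch, not part of the patch: the shape of the .bazelrc line produced by the new write_repo_env() helper, shown here for the hermetic Python version that setup_python() derives from the running interpreter. The local repo_env_line() function is hypothetical and only mirrors the format string used above:

import sys

def repo_env_line(var_name: str, var: str) -> str:
  return 'build --repo_env {}="{}"'.format(var_name, str(var))

hermetic_python = f"{sys.version_info.major}.{sys.version_info.minor}"
print(repo_env_line("HERMETIC_PYTHON_VERSION", hermetic_python))
# Prints, for example: build --repo_env HERMETIC_PYTHON_VERSION="3.10"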
@@ -683,28 +637,12 @@ def prompt_loop_or_load_from_env( return val -def set_clang_cuda_compiler_path(environ_cp): - """Set CLANG_CUDA_COMPILER_PATH.""" - default_clang_path = '/usr/lib/llvm-17/bin/clang' - if not os.path.exists(default_clang_path): - default_clang_path = '/usr/lib/llvm-16/bin/clang' - if not os.path.exists(default_clang_path): - default_clang_path = which('clang') or '' - - clang_cuda_compiler_path = prompt_loop_or_load_from_env( - environ_cp, - var_name='CLANG_CUDA_COMPILER_PATH', - var_default=default_clang_path, - ask_for_var='Please specify clang path that to be used as host compiler.', - check_success=os.path.exists, - resolve_symlinks=True, - error_msg='Invalid clang path. %s cannot be found.', - ) - - # Set CLANG_CUDA_COMPILER_PATH - environ_cp['CLANG_CUDA_COMPILER_PATH'] = clang_cuda_compiler_path - write_action_env('CLANG_CUDA_COMPILER_PATH', clang_cuda_compiler_path) - return clang_cuda_compiler_path +def choose_compiler(environ_cp): + question = 'Do you want to use Clang as the compiler?' + yes_reply = 'Clang will be used to compile Deepray.' + no_reply = 'GCC will be used to compile Deepray.' + var = int(get_var(environ_cp, 'TF_NEED_CLANG', None, False, question, yes_reply, no_reply)) + return var def set_gcc_host_compiler_path(environ_cp): @@ -726,16 +664,20 @@ def set_gcc_host_compiler_path(environ_cp): resolve_symlinks=True, error_msg='Invalid gcc path. %s cannot be found.', ) + write_repo_env("CC", gcc_host_compiler_path) + write_repo_env("BAZEL_COMPILER", gcc_host_compiler_path) + return gcc_host_compiler_path - write_action_env('GCC_HOST_COMPILER_PATH', gcc_host_compiler_path) - -def choose_compiler(environ_cp): - question = 'Do you want to use Clang to build Deepray?' - yes_reply = 'Clang will be used to compile Deepray.' - no_reply = 'GCC will be used to compile Deepray.' - var = int(get_var(environ_cp, 'TF_NEED_CLANG', None, False, question, yes_reply, no_reply)) - return var +def get_gcc_major_version(gcc_path: str): + gcc_version_proc = subprocess.run( + [gcc_path, "-dumpversion"], + check=True, + capture_output=True, + text=True, + ) + major_version = int(gcc_version_proc.stdout) + return major_version def set_clang_compiler_path(environ_cp): @@ -751,10 +693,13 @@ def set_clang_compiler_path(environ_cp): Returns: string value for clang_compiler_path. """ - # Default path if clang-16 is installed by using apt-get install - default_clang_path = '/usr/lib/llvm-17/bin/clang' + # Default path if clang-18 is installed by using apt-get install + # remove 16 logic upon release of 19 + default_clang_path = '/usr/lib/llvm-18/bin/clang' if not os.path.exists(default_clang_path): - default_clang_path = '/usr/lib/llvm-16/bin/clang' + default_clang_path = '/usr/lib/llvm-17/bin/clang' + if not os.path.exists(default_clang_path): + default_clang_path = '/usr/lib/llvm-16/bin/clang' if not os.path.exists(default_clang_path): default_clang_path = which('clang') or '' @@ -772,9 +717,8 @@ def set_clang_compiler_path(environ_cp): ), ) - write_action_env('CLANG_COMPILER_PATH', clang_compiler_path) - write_to_bazelrc('build --repo_env=CC=%s' % clang_compiler_path) - write_to_bazelrc('build --repo_env=BAZEL_COMPILER=%s' % clang_compiler_path) + write_repo_env('CC', clang_compiler_path) + write_repo_env('BAZEL_COMPILER', clang_compiler_path) return clang_compiler_path @@ -812,8 +756,16 @@ def retrieve_clang_version(clang_executable): # offset of in the current version of ubp. 
See # https://github.com/protocolbuffers/upb/blob/9effcbcb27f0a665f9f345030188c0b291e32482/upb/upb.c#L183. def disable_clang_offsetof_extension(clang_version): - if int(clang_version.split('.')[0]) in (16, 17): + clang_major_version = int(clang_version.split('.')[0]) + if clang_major_version in (16, 17): write_to_bazelrc('build --copt=-Wno-gnu-offsetof-extensions') + if clang_major_version >= 16: + # Enable clang settings that are needed for the build to work with newer + # versions of Clang. + write_to_bazelrc("build --config=clang") + if clang_major_version < 19: + # Prevent XNNPACK from using `-mavxvnniint8` (only available in clang 16+/gcc 13+). + write_to_bazelrc("build --define=xnn_enable_avxvnniint8=false") def set_tf_cuda_paths(environ_cp): @@ -885,37 +837,76 @@ def set_tf_nccl_version(environ_cp): environ_cp['TF_NCCL_VERSION'] = tf_nccl_version -def get_native_cuda_compute_capabilities(environ_cp): - """Get native cuda compute capabilities. +def _find_executable(executable: str) -> Optional[str]: + logging.info("Trying to find path to %s...", executable) + # Resolving the symlink is necessary for finding system headers. + if unresolved_path := shutil.which(executable): + return str(pathlib.Path(unresolved_path).resolve()) + return None + + +def _find_executable_or_die(executable_name: str, executable_path: Optional[str] = None) -> str: + """Finds executable and resolves symlinks or raises RuntimeError. + + Resolving symlinks is sometimes necessary for finding system headers. Args: - environ_cp: copy of the os.environ. + executable_name: The name of the executable that we want to find. + executable_path: If not None, the path to the executable. Returns: - string of native cuda compute capabilities, separated by comma. + The path to the executable we are looking for, after symlinks are resolved. + Raises: + RuntimeError: if path to the executable cannot be found. """ - device_query_bin = os.path.join(environ_cp.get('CUDA_TOOLKIT_PATH'), 'extras/demo_suite/deviceQuery') - if os.path.isfile(device_query_bin) and os.access(device_query_bin, os.X_OK): - try: - output = run_shell(device_query_bin).split('\n') - pattern = re.compile('[0-9]*\\.[0-9]*') - output = [pattern.search(x) for x in output if 'Capability' in x] - output = ','.join(x.group() for x in output if x is not None) - except subprocess.CalledProcessError: - output = '' - else: - output = '' - return output + if executable_path: + return str(pathlib.Path(executable_path).resolve(strict=True)) + resolved_path_to_exe = _find_executable(executable_name) + if resolved_path_to_exe is None: + raise RuntimeError( + f"Could not find executable `{executable_name}`! " + "Please change your $PATH or pass the path directly like" + f"`--{executable_name}_path=path/to/executable." + ) + logging.info("Found path to %s at %s", executable_name, resolved_path_to_exe) + + return resolved_path_to_exe + + +def _get_cuda_compute_capabilities_or_die() -> list[str]: + """Finds compute capabilities via nvidia-smi or rasies exception. + + Returns: + list of unique, sorted strings representing compute capabilities: + Raises: + RuntimeError: if path to nvidia-smi couldn't be found. + subprocess.CalledProcessError: if nvidia-smi process failed. 
+ """ + try: + nvidia_smi = _find_executable_or_die("nvidia-smi") + nvidia_smi_proc = subprocess.run( + [nvidia_smi, "--query-gpu=compute_cap", "--format=csv,noheader"], + capture_output=True, + check=True, + text=True, + ) + # Command above returns a newline separated list of compute capabilities + # with possible repeats. So we should unique them and sort the final result. + capabilities = sorted(set(nvidia_smi_proc.stdout.strip().split("\n"))) + logging.info("Found CUDA compute capabilities: %s", capabilities) + return ','.join(capabilities) + except (RuntimeError, subprocess.CalledProcessError) as e: + logging.info( + "Could not find nvidia-smi, or nvidia-smi command failed. Please pass" + " capabilities directly using --cuda_compute_capabilities." + ) + raise e -def set_tf_cuda_compute_capabilities(environ_cp): - """Set TF_CUDA_COMPUTE_CAPABILITIES.""" +def set_hermetic_cuda_compute_capabilities(environ_cp): + """Set HERMETIC_CUDA_COMPUTE_CAPABILITIES.""" while True: - native_cuda_compute_capabilities = get_native_cuda_compute_capabilities(environ_cp) - if not native_cuda_compute_capabilities: - default_cuda_compute_capabilities = _DEFAULT_CUDA_COMPUTE_CAPABILITIES - else: - default_cuda_compute_capabilities = native_cuda_compute_capabilities + default_cuda_compute_capabilities = _get_cuda_compute_capabilities_or_die() ask_cuda_compute_capabilities = ( 'Please specify a list of comma-separated CUDA compute capabilities ' @@ -925,18 +916,21 @@ def set_tf_cuda_compute_capabilities(environ_cp): ' binary GPU code, or as "sm_xy" to only include the binary ' 'code.\nPlease note that each additional compute capability ' 'significantly increases your build time and binary size, and that ' - 'TensorFlow only supports compute capabilities >= 3.5 [Default is: ' + 'Deepray only supports compute capabilities >= 3.5 [Default is: ' '%s]: ' % default_cuda_compute_capabilities ) - tf_cuda_compute_capabilities = get_from_env_or_user_or_default( - environ_cp, 'TF_CUDA_COMPUTE_CAPABILITIES', ask_cuda_compute_capabilities, default_cuda_compute_capabilities + hermetic_cuda_compute_capabilities = get_from_env_or_user_or_default( + environ_cp, + 'HERMETIC_CUDA_COMPUTE_CAPABILITIES', + ask_cuda_compute_capabilities, + default_cuda_compute_capabilities, ) # Check whether all capabilities from the input is valid all_valid = True # Remove all whitespace characters before splitting the string # that users may insert by accident, as this will result in error - tf_cuda_compute_capabilities = ''.join(tf_cuda_compute_capabilities.split()) - for compute_capability in tf_cuda_compute_capabilities.split(','): + hermetic_cuda_compute_capabilities = ''.join(hermetic_cuda_compute_capabilities.split()) + for compute_capability in hermetic_cuda_compute_capabilities.split(','): m = re.match('[0-9]+.[0-9]+', compute_capability) if not m: # We now support sm_35,sm_50,sm_60,compute_70. 
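Illustrative sketch, not part of the patch: the post-processing that _get_cuda_compute_capabilities_or_die() applies to `nvidia-smi --query-gpu=compute_cap --format=csv,noheader` output, which is a newline-separated list with possible repeats. The sample stdout below is assumed, not real nvidia-smi output:

sample_stdout = "8.6\n8.6\n7.5\n"
capabilities = sorted(set(sample_stdout.strip().split("\n")))
assert capabilities == ["7.5", "8.6"]
assert ",".join(capabilities) == "7.5,8.6"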
@@ -977,20 +971,26 @@ def set_tf_cuda_compute_capabilities(environ_cp): break # Reset and Retry - environ_cp['TF_CUDA_COMPUTE_CAPABILITIES'] = '' + environ_cp['HERMETIC_CUDA_COMPUTE_CAPABILITIES'] = '' - # Set TF_CUDA_COMPUTE_CAPABILITIES - environ_cp['TF_CUDA_COMPUTE_CAPABILITIES'] = tf_cuda_compute_capabilities - write_action_env('TF_CUDA_COMPUTE_CAPABILITIES', tf_cuda_compute_capabilities) + # Set HERMETIC_CUDA_COMPUTE_CAPABILITIES + environ_cp['HERMETIC_CUDA_COMPUTE_CAPABILITIES'] = (hermetic_cuda_compute_capabilities) + write_to_bazelrc( + 'build:{} --repo_env {}="{}"'.format( + 'cuda', 'HERMETIC_CUDA_COMPUTE_CAPABILITIES', str(hermetic_cuda_compute_capabilities) + ) + ) def set_other_cuda_vars(environ_cp): """Set other CUDA related variables.""" # If CUDA is enabled, always use GPU during build and test. - if environ_cp.get('TF_CUDA_CLANG') == '1': + if environ_cp.get('TF_NEED_CLANG') == '1': write_to_bazelrc('build --config=cuda_clang') + write_action_env('CLANG_CUDA_COMPILER_PATH', environ_cp.get('CLANG_COMPILER_PATH')) else: write_to_bazelrc('build --config=cuda') + write_to_bazelrc('build --config=cuda_nvcc') def system_specific_test_config(environ_cp): @@ -1192,55 +1192,24 @@ def main(): # This should be replaced with a call to tf.sysconfig if it's added write_action_env("TF_CPLUSPLUS_VER", get_cpp_version()) + tf_version_integer = get_tf_version_integer() # This is used to trace the difference between Tensorflow versions. - write_action_env("TF_VERSION_INTEGER", get_tf_version_integer()) - - if is_windows(): - environ_cp['TF_NEED_OPENCL'] = '0' - environ_cp['TF_CUDA_CLANG'] = '0' - # TODO(ibiryukov): Investigate using clang as a cpu or cuda compiler on - # Windows. - environ_cp['TF_DOWNLOAD_CLANG'] = '0' - environ_cp['TF_NEED_MPI'] = '0' - - if is_macos(): - environ_cp['TF_NEED_TENSORRT'] = '0' - - if is_ppc64le(): - # Enable MMA Dynamic Dispatch support if 'gcc' and if linker >= 2.35 - gcc_env = get_gcc_compiler(environ_cp) - if gcc_env is not None: - - # Use gold linker if 'gcc' and if 'ppc64le' - write_to_bazelrc('build --linkopt="-fuse-ld=gold"') - - # Get the linker version - ld_version = run_shell([gcc_env, '-Wl,-version']).split() - - ld_version_int = convert_version_to_int(ld_version[3]) - if ld_version_int is None: - ld_version_int = convert_version_to_int(ld_version[4]) + write_action_env("TF_VERSION_INTEGER", tf_version_integer) + write_to_bazelrc('') - # Enable if 'ld' version >= 2.35 - if ld_version_int >= 2035000: - write_to_bazelrc('build --copt="-DEIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH=1"') - - with_xla_support = environ_cp.get('TF_ENABLE_XLA', None) - if with_xla_support is not None: - write_to_bazelrc('build --define=with_xla_support=%s' % ('true' if int(with_xla_support) else 'false')) - - # set_action_env_var(environ_cp, 'TF_NEED_ROCM', 'ROCm', False, bazel_config_name='rocm') - if ( - environ_cp.get('TF_NEED_ROCM') == '1' and 'LD_LIBRARY_PATH' in environ_cp and - environ_cp.get('LD_LIBRARY_PATH') != '1' - ): - write_action_env('LD_LIBRARY_PATH', environ_cp.get('LD_LIBRARY_PATH')) - - if (environ_cp.get('TF_NEED_ROCM') == '1' and environ_cp.get('ROCM_PATH')): - write_action_env('ROCM_PATH', environ_cp.get('ROCM_PATH')) - - if (environ_cp.get('TF_NEED_ROCM') == '1' and environ_cp.get('HIP_PLATFORM')): - write_action_env('HIP_PLATFORM', environ_cp.get('HIP_PLATFORM')) + # Ask whether we should use clang for the CPU build. 
+ if is_linux(): + environ_cp['TF_NEED_CLANG'] = str(choose_compiler(environ_cp)) + if environ_cp.get('TF_NEED_CLANG') == '1': + clang_compiler_path = set_clang_compiler_path(environ_cp) + clang_version = retrieve_clang_version(clang_compiler_path) + disable_clang_offsetof_extension(clang_version) + else: + gcc_path = set_gcc_host_compiler_path(environ_cp) + gcc_major_version = get_gcc_major_version(gcc_path) + if gcc_major_version < 13: + # Prevent XNNPACK from using `-mavxvnniint8` (only available in clang 16+/gcc 13+). + write_to_bazelrc('build --define=xnn_enable_avxvnniint8=false') if is_windows(): print( @@ -1300,30 +1269,14 @@ def main(): 'times in a row. Assuming to be a scripting mistake.' % _DEFAULT_PROMPT_ASK_ATTEMPTS ) - set_tf_cuda_compute_capabilities(environ_cp) + set_hermetic_cuda_compute_capabilities(environ_cp) if 'LD_LIBRARY_PATH' in environ_cp and environ_cp.get('LD_LIBRARY_PATH') != '1': write_action_env('LD_LIBRARY_PATH', environ_cp.get('LD_LIBRARY_PATH')) - set_tf_cuda_clang(environ_cp) - if environ_cp.get('TF_CUDA_CLANG') == '1': - # Set up which clang we should use as the cuda / host compiler. - clang_cuda_compiler_path = set_clang_cuda_compiler_path(environ_cp) - clang_version = retrieve_clang_version(clang_cuda_compiler_path) - disable_clang_offsetof_extension(clang_version) - else: - # Set up which gcc nvcc should use as the host compiler - # No need to set this on Windows - if not is_windows(): - set_gcc_host_compiler_path(environ_cp) set_other_cuda_vars(environ_cp) else: - # CUDA not required. Ask whether we should use clang for the CPU build. - if is_linux(): - environ_cp['TF_NEED_CLANG'] = str(choose_compiler(environ_cp)) - if environ_cp.get('TF_NEED_CLANG') == '1': - clang_compiler_path = set_clang_compiler_path(environ_cp) - clang_version = retrieve_clang_version(clang_compiler_path) - disable_clang_offsetof_extension(clang_version) + if environ_cp.get('TF_NEED_CLANG') == '1': + write_action_env('CLANG_COMPILER_PATH', clang_compiler_path) # ROCm / CUDA are mutually exclusive. # At most 1 GPU platform can be configured. diff --git a/deepray/BUILD b/deepray/BUILD index 06896eb7..5d7a13a2 100644 --- a/deepray/BUILD +++ b/deepray/BUILD @@ -1,119 +1,32 @@ -load("//deepray:tensorflow.bzl", "if_google") -load("@bazel_skylib//lib:selects.bzl", "selects") - licenses(["notice"]) # Apache 2.0 -package(default_visibility = ["//visibility:public"]) - -config_setting( - name = "windows", - constraint_values = ["@platforms//os:windows"], +package( + default_visibility = [":internal"], + licenses = ["notice"], # Apache 2.0 ) -# Sometimes Bazel reports darwin_x86_64 as "darwin" and sometimes as -# "darwin_x86_64". The former shows up when building on a Mac x86_64 host for a Mac x86_64 target. -# The latter shows up when cross-compiling for Mac x86_64 from a Mac ARM machine and in internal -# Google builds. 
-config_setting( - name = "macos_x86_64_default", - flag_values = if_google( - {"//tools/cpp:cc_target_os": "apple"}, - {}, - ), - values = { - "apple_platform_type": "macos", - "cpu": "darwin", - }, -) +exports_files([ + "LICENSE", +]) -config_setting( - name = "macos_x86_64_crosscompile", - flag_values = if_google( - {"//tools/cpp:cc_target_os": "apple"}, - {}, - ), - values = { - "apple_platform_type": "macos", - "cpu": "darwin_x86_64", - }, -) - -selects.config_setting_group( - name = "macos_x86_64", - match_any = [ - ":macos_x86_64_default", - ":macos_x86_64_crosscompile", +package_group( + name = "internal", + includes = [ ], - visibility = ["//visibility:public"], -) - -config_setting( - name = "macos_arm64", - flag_values = if_google( - {"//tools/cpp:cc_target_os": "apple"}, - {}, - ), - values = { - "apple_platform_type": "macos", - "cpu": "darwin_arm64", - }, - visibility = ["//visibility:public"], -) - -# TODO(jakeharmon): Remove in favor of TSL version -selects.config_setting_group( - name = "macos", - match_any = [ - ":macos_x86_64", - ":macos_arm64", + packages = [ + "//...", + "//deepray/...", ], - visibility = ["//visibility:public"], ) -# Crosses between framework_shared_object and a bunch of other configurations -# due to limitations in nested select() statements. config_setting( - name = "framework_shared_object", - define_values = {"framework_shared_object": "true"}, - visibility = ["//visibility:public"], -) - -config_setting( - name = "macos_x86_64_with_framework_shared_object", - define_values = { - "framework_shared_object": "true", - }, - values = { - "apple_platform_type": "macos", - "cpu": "darwin", - }, - visibility = ["//visibility:public"], -) - -config_setting( - name = "macos_arm64_with_framework_shared_object", - define_values = { - "framework_shared_object": "true", - }, - values = { - "apple_platform_type": "macos", - "cpu": "darwin_arm64", - }, - visibility = ["//visibility:public"], -) - -selects.config_setting_group( - name = "macos_with_framework_shared_object", - match_any = [ - ":macos_x86_64_with_framework_shared_object", - ":macos_arm64_with_framework_shared_object", - ], - visibility = ["//visibility:public"], + name = "windows", + constraint_values = ["@platforms//os:windows"], ) py_library( name = "deepray", - srcs = glob(["*.py"]), + srcs = glob(["**/*.py"]), deps = [ "//deepray/activations", "//deepray/callbacks", @@ -123,10 +36,11 @@ py_library( "//deepray/layers", "//deepray/losses", "//deepray/metrics", + "//deepray/models", "//deepray/optimizers", - "//deepray/seq2seq", + # "//deepray/seq2seq", "//deepray/testing", - "//deepray/text", + # "//deepray/text", "//deepray/utils", ], ) diff --git a/deepray/__init__.py b/deepray/__init__.py index b8731d98..a1d02a99 100644 --- a/deepray/__init__.py +++ b/deepray/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# Copyright 2023 The Deepray Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,22 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Useful extra functionality for TensorFlow maintained by SIG-deepray.""" +import argparse +import os + +os.environ["TF_USE_LEGACY_KERAS"] = "1" import sys +import tensorflow as tf from absl import flags -from deepray.utils.flags import common_flags - -common_flags.define_common_flags() - -FLAGS = flags.FLAGS -FLAGS(sys.argv, known_only=True) - -from deepray.utils.ensure_tf_install import _check_tf_version - -_check_tf_version() - # Local project imports from deepray import activations from deepray import callbacks @@ -35,12 +28,98 @@ from deepray import layers from deepray import losses from deepray import metrics +from deepray import models from deepray import optimizers -from deepray.layers import rnn -from deepray import seq2seq -from deepray import text from deepray import options from deepray.register import register_all +from deepray.utils import logging_util from deepray.utils import types - +from deepray.utils.ensure_tf_install import _check_tf_version +from deepray.utils.flags import common_flags from deepray.version import __version__ +from deepray.utils import gpu_affinity + +# _check_tf_version() + +logger = logging_util.get_logger() + +common_flags.define_common_flags() +flags.FLAGS(sys.argv, known_only=True) + + +def init(): + logger.debug(f"sys.argv = {sys.argv}") # sys.argv from Horovod + + gpus = tf.config.experimental.list_physical_devices('GPU') + for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) + + if flags.FLAGS.distribution_strategy == "horovod": + import horovod.tensorflow as hvd + hvd.init() + if gpus: + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') + gpu_affinity.set_affinity(hvd.local_rank()) + + +def start_tensorflow_server(cluster_resolver): + # Set the environment variable to allow reporting worker and ps failure to the + # coordinator. This is a workaround and won't be necessary in the future. + os.environ["GRPC_FAIL_FAST"] = "use_caller" + + server = tf.distribute.Server( + cluster_resolver.cluster_spec(), + job_name=cluster_resolver.task_type, + task_index=cluster_resolver.task_id, + protocol=cluster_resolver.rpc_layer or "grpc", + start=True, + ) + server.join() + + +def runner(function, verbose=None): + parser = argparse.ArgumentParser(description='Deepray Runner') + parser.add_argument('-v', '--version', action='version', version=__version__, help='Shows Deepray version.') + parser.add_argument( + '--distribution_strategy', type=str, default='Horovod', help='Whether run distributed training with Horovod.' 
+ ) + + physical_devices = tf.config.list_physical_devices('GPU') + world_size = len(physical_devices) + logger.debug(f"world_size = {world_size}") + + user_argv = sys.argv # get user specified args + args, unknown = parser.parse_known_args() + + if world_size > 1 and args.distribution_strategy == "Horovod": + user_argv.extend([ + "--distribution_strategy=horovod", + f"--num_gpus={world_size}", + "--use_horovod", + ]) + try: + import horovod + os.environ['HOROVOD_STALL_CHECK_TIME_SECONDS'] = '5' + os.environ['HOROVOD_STALL_SHUTDOWN_TIME_SECONDS'] = '30' + except ImportError: + raise ValueError("Please install Horovod properly first if you want to use Horovod distribution_strategy.") + + def helper(argv, main): + logger.debug(f"argv = {argv}") + init() + main() + + horovod.run(helper, args=(sys.argv,), kwargs={"main": function}, np=world_size, verbose=verbose, use_mpi=True) + elif args.distribution_strategy == "ParameterServer": + cluster_resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver() + if cluster_resolver.task_type in ("worker", "ps"): + start_tensorflow_server(cluster_resolver) + else: + user_argv.extend(["--distribution_strategy=parameter_server"]) + init() + function() + else: + logger.info("Deepray finds only one GPU available, so we turn off distribution_strategy.") + user_argv.extend(["--distribution_strategy=off", f"--num_gpus={world_size}"]) + init() + function() diff --git a/deepray/activations/__init__.py b/deepray/activations/__init__.py index 58300cdb..e69de29b 100644 --- a/deepray/activations/__init__.py +++ b/deepray/activations/__init__.py @@ -1,27 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Additional activation functions.""" - -from deepray.activations.hardshrink import hardshrink -from deepray.activations.lisht import lisht -from deepray.activations.mish import mish -from deepray.activations.softshrink import softshrink -from deepray.activations.rrelu import rrelu -from deepray.activations.snake import snake -from deepray.activations.sparsemax import sparsemax -from deepray.activations.tanhshrink import tanhshrink -from deepray.activations.swish import simple_swish -from deepray.activations.swish import hard_swish -from deepray.activations.swish import identity \ No newline at end of file diff --git a/deepray/callbacks/__init__.py b/deepray/callbacks/__init__.py index 98f2eae1..ee575bcd 100755 --- a/deepray/callbacks/__init__.py +++ b/deepray/callbacks/__init__.py @@ -15,6 +15,7 @@ """Additional callbacks that conform to Keras API.""" from deepray.callbacks.average_model_checkpoint import AverageModelCheckpoint +from deepray.callbacks.callbacks import HvdCallbackList +from deepray.callbacks.model_checkpoint import ModelCheckpoint from deepray.callbacks.time_stopping import TimeStopping from deepray.callbacks.tqdm_progress_bar import TQDMProgressBar -from deepray.callbacks.callbacks import HvdCallbackList diff --git a/deepray/callbacks/callbacks.py b/deepray/callbacks/callbacks.py index 6c6f7066..0250e3fb 100644 --- a/deepray/callbacks/callbacks.py +++ b/deepray/callbacks/callbacks.py @@ -13,44 +13,43 @@ # limitations under the License. # ============================================================================== """Callbacks: utilities called at certain points during model training.""" - -import horovod.tensorflow.keras as hvd import numpy as np import tensorflow as tf from absl import flags -from keras.callbacks import CallbackList - -FLAGS = flags.FLAGS +from tf_keras import callbacks as callbacks_module def sync_to_numpy_or_python_type(tensors): """Syncs and converts a structure of `Tensor`s to `NumPy` arrays or Python - scalar types. + scalar types. - For each tensor, it calls `tensor.numpy()`. If the result is a scalar value, - it converts it to a Python type, such as a float or int, by calling - `result.item()`. + For each tensor, it calls `tensor.numpy()`. If the result is a scalar value, + it converts it to a Python type, such as a float or int, by calling + `result.item()`. - Numpy scalars are converted, as Python types are often more convenient to - deal with. This is especially useful for bfloat16 Numpy scalars, which don't - support as many operations as other Numpy values. + Numpy scalars are converted, as Python types are often more convenient to + deal with. This is especially useful for bfloat16 Numpy scalars, which don't + support as many operations as other Numpy values. - Async strategies (such as `TPUStrategy` and `ParameterServerStrategy`) are - forced to - sync during this process. + Async strategies (such as `TPUStrategy` and `ParameterServerStrategy`) are + forced to + sync during this process. - Args: - tensors: A structure of tensors. + Args: + tensors: A structure of tensors. - Returns: - `tensors`, but scalar tensors are converted to Python types and non-scalar - tensors are converted to Numpy arrays. - """ + Returns: + `tensors`, but scalar tensors are converted to Python types and non-scalar + tensors are converted to Numpy arrays. 
+ """ if isinstance(tensors, tf.distribute.experimental.coordinator.RemoteValue): tensors = tensors.fetch() + if isinstance(tensors, list) and isinstance(tensors[0], tf.distribute.experimental.coordinator.RemoteValue): + tensors = tf.nest.map_structure(lambda t: t.fetch(), tensors) def _to_single_numpy_or_python_type(t): - if FLAGS.use_horovod: + if flags.FLAGS.use_horovod: + import horovod.tensorflow.keras as hvd t = hvd.allreduce(t, op=hvd.Average) # Don't turn ragged or sparse tensors to NumPy. if isinstance(t, tf.Tensor): @@ -64,7 +63,7 @@ def _to_single_numpy_or_python_type(t): return tf.nest.map_structure(_to_single_numpy_or_python_type, tensors) -class HvdCallbackList(CallbackList): +class HvdCallbackList(callbacks_module.CallbackList): def _process_logs(self, logs, is_batch_hook=False): """Turns tensors into numpy arrays or Python scalars if necessary.""" diff --git a/deepray/callbacks/model_checkpoint.py b/deepray/callbacks/model_checkpoint.py new file mode 100644 index 00000000..8b84b986 --- /dev/null +++ b/deepray/callbacks/model_checkpoint.py @@ -0,0 +1,147 @@ +# Copyright 2023 The Deepray Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +import os +import sys + +import tensorflow as tf +from absl import flags +from tf_keras.callbacks import Callback +from typeguard import typechecked + +from deepray.utils import export +from deepray.utils import logging_util +from deepray.utils.horovod_utils import is_main_process, get_world_size, get_rank + +logger = logging_util.get_logger() + + +@tf.keras.utils.register_keras_serializable(package="Deepray") +class ModelCheckpoint(Callback): + + @typechecked + def __init__(self, save_checkpoint_steps: int = sys.maxsize, max_to_keep: int = 3): + super().__init__() + self.save_checkpoint_steps = save_checkpoint_steps + self.max_to_keep = max_to_keep + self.epochs = flags.FLAGS.epochs + if flags.FLAGS.stop_steps >= 0: + self.epochs = 1 + if flags.FLAGS.use_dynamic_embedding: + from tensorflow_recommenders_addons import dynamic_embedding as de + tf.train.Checkpoint = de.train.checkpoint.DECheckpoint + + def set_models(self, models): + self.models = models + + def set_optimizer(self, optimizer): + self.optimizer = optimizer + + # def set_iterator(self, iterator): + # self.iterator = iterator + + @property + def manager(self): + if len(self._managers) == 1: + return self._managers["main"] + else: + return self._managers + + def on_callback_begin(self): + self._checkpoints, self._managers = {}, {} + for name, model in self.models.items(): + if "main" in name: + _checkpoint = tf.train.Checkpoint(model=model, optimizer=self.optimizer) + self._checkpoints[name] = _checkpoint + if get_world_size() > 1: + self._managers[name] = tf.train.CheckpointManager( + _checkpoint, + os.path.join(flags.FLAGS.model_dir, f'ckpt_{name}_{get_rank()}'), + max_to_keep=self.max_to_keep + ) + else: + self._managers[name] = tf.train.CheckpointManager( + _checkpoint, 
os.path.join(flags.FLAGS.model_dir, f'ckpt_{name}'), max_to_keep=self.max_to_keep + ) + else: + _checkpoint = tf.train.Checkpoint(model=model) + self._checkpoints[name] = _checkpoint + self._managers[name] = tf.train.CheckpointManager( + _checkpoint, os.path.join(flags.FLAGS.model_dir, f'ckpt_{name}'), max_to_keep=self.max_to_keep + ) + + if flags.FLAGS.init_checkpoint: + for (name, ckpt), init_ckpt in zip(self._checkpoints.items(), flags.FLAGS.init_checkpoint): + if init_ckpt: + if tf.io.gfile.isdir(init_ckpt): + latest_checkpoint = tf.train.latest_checkpoint(init_ckpt) + else: + latest_checkpoint = init_ckpt + logger.info( + f'Checkpoint file {latest_checkpoint} found and restoring from initial checkpoint for {name} model.' + ) + if os.getenv("DEEPRAY_VERBOSITY", None) == "detail" or flags.FLAGS.use_dynamic_embedding: + # TFRA DE doesn't support "assert_existing_objects_matched" method + ckpt.restore(latest_checkpoint) + else: + ckpt.restore(latest_checkpoint).assert_existing_objects_matched() + logger.info('Loading from checkpoint file...') + + self.current_step = 0 + self._steps_from_save = 0 # self.optimizer.iterations.numpy() + + def on_train_begin(self, logs=None): + self.on_callback_begin() + + def on_test_begin(self, logs=None): + self.on_callback_begin() + + def on_predict_begin(self, logs=None): + self.on_callback_begin() + + def on_train_batch_end(self, batch, logs=None): + self.current_step = batch + if self._steps_from_save + self.save_checkpoint_steps <= batch: + export.export_to_checkpoint(self.manager, batch) + self._steps_from_save = batch + + def on_epoch_end(self, epoch, logs=None): + # Saves model checkpoints and run validation steps at every epoch end. + # To avoid repeated model saving, we do not save after the last step of training. + if epoch < self.epochs - 1: + export.export_to_checkpoint(self.manager, self.current_step) + + def on_train_end(self, logs=None): + export.export_to_checkpoint(self.manager, self.current_step) + + def get_config(self): + config = { + "save_checkpoint_steps": self.save_checkpoint_steps, + "max_to_keep": self.max_to_keep, + } + + base_config = super().get_config() + return {**base_config, **config} + + +class SimpleCheckpoint(Callback): + """Keras callback to save tf.train.Checkpoints.""" + + def __init__(self, checkpoint_manager): + super(SimpleCheckpoint, self).__init__() + self.checkpoint_manager = checkpoint_manager + + def on_epoch_end(self, epoch, logs=None): + step_counter = self.checkpoint_manager._step_counter.numpy() # pylint: disable=protected-access + self.checkpoint_manager.save(checkpoint_number=step_counter) diff --git a/deepray/callbacks/profiler_callback.py b/deepray/callbacks/profiler_callback.py new file mode 100644 index 00000000..229c715f --- /dev/null +++ b/deepray/callbacks/profiler_callback.py @@ -0,0 +1,68 @@ +from tf_keras.callbacks import Callback +from tensorflow.python.eager import profiler + +from deepray.utils import logging_util + +logger = logging_util.get_logger() + + +def get_profiler_callback(model_dir, profile_steps, enable_tensorboard, steps_per_epoch): + """Validate profile_steps flag value and return profiler callback.""" + profile_steps_error_message = ( + 'profile_steps must be a comma separated pair of positive integers, ' + 'specifying the first and last steps to be profiled.' 
+ ) + try: + profile_steps = [int(i) for i in profile_steps.split(',')] + except ValueError: + raise ValueError(profile_steps_error_message) + if len(profile_steps) != 2: + raise ValueError(profile_steps_error_message) + start_step, stop_step = profile_steps + if start_step < 0 or start_step > stop_step: + raise ValueError(profile_steps_error_message) + if enable_tensorboard: + logger.warning( + 'Both TensorBoard and profiler callbacks are used. Note that the ' + 'TensorBoard callback profiles the 2nd step (unless otherwise ' + 'specified). Please make sure the steps profiled by the two callbacks ' + 'do not overlap.' + ) + return ProfilerCallback(model_dir, start_step, stop_step, steps_per_epoch) + + +class ProfilerCallback(Callback): + """Save profiles in specified step range to log directory.""" + + def __init__(self, log_dir, start_step, stop_step, steps_per_epoch): + super(ProfilerCallback, self).__init__() + self.log_dir = log_dir + self.start_step = start_step + self.stop_step = stop_step + self.start_epoch = start_step // steps_per_epoch + self.stop_epoch = stop_step // steps_per_epoch + self.start_step_in_epoch = start_step % steps_per_epoch + self.stop_step_in_epoch = stop_step % steps_per_epoch + self.should_start = False + self.should_stop = False + + def on_epoch_begin(self, epoch, logs=None): + if epoch == self.start_epoch: + self.should_start = True + if epoch == self.stop_epoch: + self.should_stop = True + + def on_batch_begin(self, batch, logs=None): + if batch == self.start_step_in_epoch and self.should_start: + self.should_start = False + profiler.start() + logger.info('Profiler started at Step %s', self.start_step) + + def on_batch_end(self, batch, logs=None): + if batch == self.stop_step_in_epoch and self.should_stop: + self.should_stop = False + results = profiler.stop() + profiler.save(self.log_dir, results) + logger.info( + 'Profiler saved profiles for steps between %s and %s to %s', self.start_step, self.stop_step, self.log_dir + ) diff --git a/deepray/callbacks/progbar_logger.py b/deepray/callbacks/progbar_logger.py new file mode 100644 index 00000000..27edf0cd --- /dev/null +++ b/deepray/callbacks/progbar_logger.py @@ -0,0 +1,458 @@ +# Copyright 2023 The Deepray Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +import json +import os +import time +import copy +import numpy as np +import sys + +import tensorflow as tf +from absl import flags +from tf_keras.callbacks import Callback +from tf_keras.src.utils import io_utils +from tf_keras.src.utils import tf_utils + +from deepray.utils import logging_util +from deepray.utils.benchmark import PerformanceCalculator +from deepray.utils.flags import common_flags +from deepray.utils.horovod_utils import is_main_process, get_world_size + +logger = logging_util.get_logger() + + +class Progbar: + """Displays a progress bar. + + Args: + target: Total number of steps expected, None if unknown. 
+ width: Progress bar width on screen. + verbose: Verbosity mode, 0 (silent), 1 (verbose), 2 (semi-verbose) + stateful_metrics: Iterable of string names of metrics that should *not* + be averaged over time. Metrics in this list will be displayed as-is. + All others will be averaged by the progbar before display. + interval: Minimum visual progress update interval (in seconds). + unit_name: Display name for step counts (usually "step" or "sample"). + """ + + def __init__( + self, + target, + width=30, + verbose=1, + interval=0.05, + stateful_metrics=None, + unit_name="step", + ): + self.target = target + self.width = width + self.verbose = verbose + self.interval = interval + self.unit_name = unit_name + if stateful_metrics: + self.stateful_metrics = set(stateful_metrics) + else: + self.stateful_metrics = set() + + self._dynamic_display = ( + (hasattr(sys.stdout, "isatty") and sys.stdout.isatty()) or "ipykernel" in sys.modules or + "posix" in sys.modules or "PYCHARM_HOSTED" in os.environ + ) + self._total_width = 0 + self._seen_so_far = 0 + # We use a dict + list to avoid garbage collection + # issues found in OrderedDict + self._values = {} + self._values_order = [] + self._start = time.time() + self._last_update = 0 + self._time_at_epoch_start = self._start + self._time_at_epoch_end = None + self._time_after_first_step = None + + def update(self, current, values=None, finalize=None): + """Updates the progress bar. + + Args: + current: Index of current step. + values: List of tuples: `(name, value_for_last_step)`. If `name` is + in `stateful_metrics`, `value_for_last_step` will be displayed + as-is. Else, an average of the metric over time will be + displayed. + finalize: Whether this is the last update for the progress bar. If + `None`, uses `current >= self.target`. Defaults to `None`. + """ + if finalize is None: + if self.target is None: + finalize = False + else: + finalize = current >= self.target + + values = values or [] + for k, v in values: + if k not in self._values_order: + self._values_order.append(k) + if k not in self.stateful_metrics: + # In the case that progress bar doesn't have a target value in + # the first epoch, both on_batch_end and on_epoch_end will be + # called, which will cause 'current' and 'self._seen_so_far' to + # have the same value. Force the minimal value to 1 here, + # otherwise stateful_metric will be 0s. + value_base = max(current - self._seen_so_far, 1) + if k not in self._values: + self._values[k] = [v * value_base, value_base] + else: + self._values[k][0] += v * value_base + self._values[k][1] += value_base + else: + # Stateful metrics output a numeric value. This representation + # means "take an average from a single value" but keeps the + # numeric formatting. + self._values[k] = [v, 1] + self._seen_so_far = current + + message = "" + now = time.time() + info = f" - {now - self._start:.0f}s" + if current == self.target: + self._time_at_epoch_end = now + if self.verbose == 1: + if now - self._last_update < self.interval and not finalize: + return + + prev_total_width = self._total_width + if self._dynamic_display: + message += "\b" * prev_total_width + message += "\r" + else: + message += "\n" + + if self.target is not None: + numdigits = int(np.log10(self.target)) + 1 + bar = ("%" + str(numdigits) + "d/%d [") % (current, self.target) + prog = float(current) / self.target + prog_width = int(self.width * prog) + if prog_width > 0: + bar += "=" * (prog_width - 1) + if current < self.target: + bar += ">" + else: + bar += "=" + bar += "." 
* (self.width - prog_width) + bar += "]" + else: + bar = "%7d/Unknown" % current + + self._total_width = len(bar) + message += bar + + time_per_unit = self._estimate_step_duration(current, now) + + if self.target is None or finalize: + info += self._format_time(time_per_unit, self.unit_name) + else: + eta = time_per_unit * (self.target - current) + if eta > 3600: + eta_format = "%d:%02d:%02d" % ( + eta // 3600, + (eta % 3600) // 60, + eta % 60, + ) + elif eta > 60: + eta_format = "%d:%02d" % (eta // 60, eta % 60) + else: + eta_format = "%ds" % eta + + info = f" - ETA: {eta_format}" + + for k in self._values_order: + info += f" - {k}:" + if isinstance(self._values[k], list): + avg = np.mean(self._values[k][0] / max(1, self._values[k][1])) + if abs(avg) > 1e-3: + info += f" {avg:.4f}" + else: + info += f" {avg:.4e}" + else: + info += f" {self._values[k]}" + + self._total_width += len(info) + if prev_total_width > self._total_width: + info += " " * (prev_total_width - self._total_width) + + if finalize: + info += "\n" + + message += info + logger.info(message) + # io_utils.print_msg(message, line_break=False) + message = "" + + elif self.verbose == 2: + if finalize: + numdigits = int(np.log10(self.target)) + 1 + count = ("%" + str(numdigits) + "d/%d") % (current, self.target) + info = count + info + for k in self._values_order: + info += f" - {k}:" + avg = np.mean(self._values[k][0] / max(1, self._values[k][1])) + if avg > 1e-3: + info += f" {avg:.4f}" + else: + info += f" {avg:.4e}" + if self._time_at_epoch_end: + time_per_epoch = (self._time_at_epoch_end - self._time_at_epoch_start) + avg_time_per_step = time_per_epoch / self.target + self._time_at_epoch_start = now + self._time_at_epoch_end = None + info += " -" + self._format_time(time_per_epoch, "epoch") + info += " -" + self._format_time(avg_time_per_step, self.unit_name) + info += "\n" + message += info + io_utils.print_msg(message, line_break=False) + message = "" + + self._last_update = now + + def add(self, n, values=None): + self.update(self._seen_so_far + n, values) + + def _format_time(self, time_per_unit, unit_name): + """format a given duration to display to the user. + + Given the duration, this function formats it in either milliseconds + or seconds and displays the unit (i.e. ms/step or s/epoch) + Args: + time_per_unit: the duration to display + unit_name: the name of the unit to display + Returns: + a string with the correctly formatted duration and units + """ + formatted = "" + if time_per_unit >= 1 or time_per_unit == 0: + formatted += f" {time_per_unit:.0f}s/{unit_name}" + elif time_per_unit >= 1e-3: + formatted += f" {time_per_unit * 1000.0:.0f}ms/{unit_name}" + else: + formatted += f" {time_per_unit * 1000000.0:.0f}us/{unit_name}" + return formatted + + def _estimate_step_duration(self, current, now): + """Estimate the duration of a single step. + + Given the step number `current` and the corresponding time `now` this + function returns an estimate for how long a single step takes. If this + is called before one step has been completed (i.e. `current == 0`) then + zero is given as an estimate. The duration estimate ignores the duration + of the (assumed to be non-representative) first step for estimates when + more steps are available (i.e. `current>1`). + + Args: + current: Index of current step. + now: The current time. + + Returns: Estimate of the duration of a single step. 
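+
+    Illustrative example: if the first step finishes 10s after the bar is
+    created and `current=5` arrives 4s later, the first step is ignored and
+    the estimate is (now - time_after_first_step) / (current - 1) = 4s / 4
+    = 1.0 s/step.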
+ """ + if current: + # there are a few special scenarios here: + # 1) somebody is calling the progress bar without ever supplying + # step 1 + # 2) somebody is calling the progress bar and supplies step one + # multiple times, e.g. as part of a finalizing call + # in these cases, we just fall back to the simple calculation + if self._time_after_first_step is not None and current > 1: + time_per_unit = (now - self._time_after_first_step) / (current - 1) + else: + time_per_unit = (now - self._start) / current + + if current == 1: + self._time_after_first_step = now + return time_per_unit + else: + return 0 + + def _update_stateful_metrics(self, stateful_metrics): + self.stateful_metrics = self.stateful_metrics.union(stateful_metrics) + + +class ProgbarLogger(Callback): + """Callback that prints metrics to stdout. + + Args: + count_mode: One of `"steps"` or `"samples"`. + Whether the progress bar should + count samples seen or steps (batches) seen. + stateful_metrics: Iterable of string names of metrics that + should *not* be averaged over an epoch. + Metrics in this list will be logged as-is. + All others will be averaged over time (e.g. loss, etc). + If not provided, defaults to the `Model`'s metrics. + + Raises: + ValueError: In case of invalid `count_mode`. + """ + + def __init__(self, count_mode: str = "samples", stateful_metrics=None): + super().__init__() + self._supports_tf_logs = True + if count_mode == "samples": + self.use_steps = False + elif count_mode == "steps": + self.use_steps = True + else: + raise ValueError(f"Unknown `count_mode`: {count_mode}. " + 'Expected values are ["samples", "steps"]') + # Defaults to all Model's metrics except for loss. + self.stateful_metrics = (set(stateful_metrics) if stateful_metrics else set()) + + self.seen = 0 + self.progbar = None + self.target = None + self.verbose = 1 + self.epochs = 1 + + self._train_step, self._test_step, self._predict_step = None, None, None + self._call_batch_hooks = True + + self._called_in_fit = False + + def set_params(self, params): + self.verbose = params["verbose"] + self.epochs = params["epochs"] + if self.use_steps and "steps" in params: + self.target = params["steps"] + elif not self.use_steps and "samples" in params: + self.target = params["samples"] + else: + self.target = ( + None # Will be inferred at the end of the first epoch. + ) + + self._call_batch_hooks = self.verbose == 1 + if self.target is None: + try: + self._train_step = self.model._train_counter + self._test_step = self.model._test_counter + self._predict_step = self.model._predict_counter + except AttributeError: + self._call_batch_hooks = True + + def on_train_begin(self, logs=None): + # When this logger is called inside `fit`, validation is silent. + self._called_in_fit = True + + def on_test_begin(self, logs=None): + if not self._called_in_fit: + self._reset_progbar() + self._maybe_init_progbar() + + def on_predict_begin(self, logs=None): + self._reset_progbar() + self._maybe_init_progbar() + + def on_epoch_begin(self, epoch, logs=None): + self._reset_progbar() + self._maybe_init_progbar() + if self.verbose and self.epochs > 1: + io_utils.print_msg(f"Epoch {epoch + 1}/{self.epochs}") + + def on_train_batch_end(self, batch, logs=None): + self._batch_update_progbar(batch, logs) + + def on_test_batch_end(self, batch, logs=None): + if not self._called_in_fit: + self._batch_update_progbar(batch, logs) + + def on_predict_batch_end(self, batch, logs=None): + # Don't pass prediction results. 
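+    # (Prediction outputs are not metrics, so there is nothing to average or
+    # display for the progress bar.)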
+ self._batch_update_progbar(batch, None) + + def on_epoch_end(self, epoch, logs=None): + self._finalize_progbar(logs, self._train_step) + + def on_test_end(self, logs=None): + if not self._called_in_fit: + self._finalize_progbar(logs, self._test_step) + + def on_predict_end(self, logs=None): + self._finalize_progbar(logs, self._predict_step) + + def _reset_progbar(self): + self.seen = 0 + self.progbar = None + + def _maybe_init_progbar(self): + """Instantiate a `Progbar` if not yet, and update the stateful + metrics.""" + # TODO(rchao): Legacy TF1 code path may use list for + # `self.stateful_metrics`. Remove "cast to set" when TF1 support is + # dropped. + self.stateful_metrics = set(self.stateful_metrics) + + if self.model: + # Update the existing stateful metrics as `self.model.metrics` may + # contain updated metrics after `MetricsContainer` is built in the + # first train step. + self.stateful_metrics = self.stateful_metrics.union(set(m.name for m in self.model.metrics)) + + if self.progbar is None: + self.progbar = Progbar( + target=self.target, + verbose=self.verbose, + stateful_metrics=self.stateful_metrics, + unit_name="step" if self.use_steps else "sample", + ) + + self.progbar._update_stateful_metrics(self.stateful_metrics) + + def _implements_train_batch_hooks(self): + return self._call_batch_hooks + + def _implements_test_batch_hooks(self): + return self._call_batch_hooks + + def _implements_predict_batch_hooks(self): + return self._call_batch_hooks + + def _batch_update_progbar(self, batch, logs=None): + """Updates the progbar.""" + logs = logs or {} + self._maybe_init_progbar() + if self.use_steps: + self.seen = batch + 1 # One-indexed. + else: + # v1 path only. + logs = copy.copy(logs) + batch_size = logs.pop("size", 0) + num_steps = logs.pop("num_steps", 1) + logs.pop("batch", None) + add_seen = num_steps * batch_size + self.seen += add_seen + + if self.verbose == 1: + # Only block async when verbose = 1. + logs = tf_utils.sync_to_numpy_or_python_type(logs) + self.progbar.update(self.seen, list(logs.items()), finalize=False) + + def _finalize_progbar(self, logs, counter): + logs = tf_utils.sync_to_numpy_or_python_type(logs or {}) + if self.target is None: + if counter is not None: + counter = counter.numpy() + if not self.use_steps: + counter *= logs.get("size", 1) + self.target = counter or self.seen + self.progbar.target = self.target + self.progbar.update(self.target, list(logs.items()), finalize=True) diff --git a/deepray/utils/misc/keras_utils.py b/deepray/callbacks/time_history.py similarity index 65% rename from deepray/utils/misc/keras_utils.py rename to deepray/callbacks/time_history.py index 94b99092..0778c52c 100644 --- a/deepray/utils/misc/keras_utils.py +++ b/deepray/callbacks/time_history.py @@ -1,29 +1,14 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Helper functions for the Keras implementations of models.""" - -import multiprocessing -import os import time -from absl import logging import tensorflow as tf - +from tf_keras.callbacks import Callback from tensorflow.python.eager import monitoring -global_batch_size_gauge = monitoring.IntGauge('/tensorflow/training/global_batch_size', 'TF training global batch size') +from deepray.utils import logging_util + +logger = logging_util.get_logger() +global_batch_size_gauge = monitoring.IntGauge('/tensorflow/training/global_batch_size', 'TF training global batch size') first_batch_time_gauge = monitoring.IntGauge( '/tensorflow/training/first_batch', 'TF training start/end time for first batch (unix epoch time in us.', 'type' ) @@ -43,7 +28,7 @@ def __repr__(self): return "'BatchTimestamp'".format(self.batch_index, self.timestamp) -class TimeHistory(tf.keras.callbacks.Callback): +class TimeHistory(Callback): """Callback for Keras models.""" def __init__(self, batch_size, log_steps, initial_step=0, logdir=None): @@ -137,7 +122,7 @@ def on_batch_end(self, batch, logs=None): examples_per_second = steps_per_second * self.batch_size self.timestamp_log.append(BatchTimestamp(self.global_steps, now)) - logging.info( + logger.info( 'TimeHistory: %.2f seconds, %.2f examples/second between steps %d ' 'and %d', elapsed_time, examples_per_second, self.last_log_step, self.global_steps ) @@ -156,46 +141,3 @@ def on_epoch_end(self, epoch, logs=None): self.steps_before_epoch += self.steps_in_epoch self.steps_in_epoch = 0 - - -class SimpleCheckpoint(tf.keras.callbacks.Callback): - """Keras callback to save tf.train.Checkpoints.""" - - def __init__(self, checkpoint_manager): - super(SimpleCheckpoint, self).__init__() - self.checkpoint_manager = checkpoint_manager - - def on_epoch_end(self, epoch, logs=None): - step_counter = self.checkpoint_manager._step_counter.numpy() # pylint: disable=protected-access - self.checkpoint_manager.save(checkpoint_number=step_counter) - - -def set_session_config(enable_xla=False): - """Sets the session config.""" - if enable_xla: - tf.config.optimizer.set_jit(True) - - -# TODO(hongkuny): remove set_config_v2 globally. -set_config_v2 = set_session_config - - -def set_gpu_thread_mode_and_count(gpu_thread_mode, datasets_num_private_threads, num_gpus, per_gpu_thread_count): - """Set GPU thread mode and count, and adjust dataset threads count.""" - cpu_count = multiprocessing.cpu_count() - logging.info('Logical CPU cores: %s', cpu_count) - - # Allocate private thread pool for each GPU to schedule and launch kernels - per_gpu_thread_count = per_gpu_thread_count or 2 - os.environ['TF_GPU_THREAD_MODE'] = gpu_thread_mode - os.environ['TF_GPU_THREAD_COUNT'] = str(per_gpu_thread_count) - logging.info('TF_GPU_THREAD_COUNT: %s', os.environ['TF_GPU_THREAD_COUNT']) - logging.info('TF_GPU_THREAD_MODE: %s', os.environ['TF_GPU_THREAD_MODE']) - - # Limit data preprocessing threadpool to CPU cores minus number of total GPU - # private threads and memory copy threads. 
-  total_gpu_thread_count = per_gpu_thread_count * num_gpus
-  num_runtime_threads = num_gpus
-  if not datasets_num_private_threads:
-    datasets_num_private_threads = min(cpu_count - total_gpu_thread_count - num_runtime_threads, num_gpus * 8)
-  logging.info('Set datasets_num_private_threads to %s', datasets_num_private_threads)
diff --git a/deepray/callbacks/time_stopping.py b/deepray/callbacks/time_stopping.py
index ca23885e..4196bf0e 100644
--- a/deepray/callbacks/time_stopping.py
+++ b/deepray/callbacks/time_stopping.py
@@ -18,7 +18,7 @@
 import time
 
 import tensorflow as tf
-from tensorflow.keras.callbacks import Callback
+from tf_keras.callbacks import Callback
 from typeguard import typechecked
 
 
diff --git a/deepray/callbacks/tqdm_progress_bar.py b/deepray/callbacks/tqdm_progress_bar.py
index c51291dc..8805b677 100644
--- a/deepray/callbacks/tqdm_progress_bar.py
+++ b/deepray/callbacks/tqdm_progress_bar.py
@@ -18,7 +18,7 @@
 from collections import defaultdict
 
 import tensorflow as tf
-from tensorflow.keras.callbacks import Callback
+from tf_keras.callbacks import Callback
 from typeguard import typechecked
 
 
diff --git a/deepray/callbacks/training_speed.py b/deepray/callbacks/training_speed.py
new file mode 100644
index 00000000..72ab756f
--- /dev/null
+++ b/deepray/callbacks/training_speed.py
@@ -0,0 +1,155 @@
+# Copyright 2023 The Deepray Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+import tensorflow as tf
+from absl import flags
+from tf_keras.callbacks import Callback
+from tf_keras.src.utils import io_utils
+
+from deepray.utils import logging_util
+from deepray.utils.benchmark import PerformanceCalculator
+from deepray.utils.horovod_utils import get_world_size, is_main_process
+
+logger = logging_util.get_logger()
+
+
+class TrainingSpeed(Callback):
+  """Callback that logs training throughput (examples/sec) to stdout.
+
+  Args:
+      batch_size: Per-replica (local) batch size used to compute throughput.
+          Defaults to `flags.FLAGS.batch_size`. When `flags.FLAGS.use_horovod`
+          is set, the global batch size becomes `batch_size * get_world_size()`.
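+
+  Example (a minimal sketch, assuming absl flags are already parsed and the
+  model is compiled; `train_ds` is a hypothetical `tf.data.Dataset`):
+
+  ```python
+  speed_cb = TrainingSpeed(batch_size=1024)
+  speed_cb.set_optimizer(model.optimizer)  # optional, enables warm-up exclusion
+  model.fit(train_ds, epochs=3, callbacks=[speed_cb])
+  ```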
+  """
+
+  def __init__(self, batch_size: int = None):
+    super().__init__()
+    local_batch_size = batch_size or flags.FLAGS.batch_size
+    logger.info(f"Callback using local (per-replica) batch_size: {local_batch_size}")
+
+    if flags.FLAGS.use_horovod:
+      world_size = get_world_size()
+      self.global_batch_size = local_batch_size * world_size
+      if is_main_process():
+        logger.info(f"Horovod enabled: global_batch_size set to {self.global_batch_size} ({world_size} workers)")
+    else:
+      self.global_batch_size = local_batch_size
+
+    self.seen = 0
+    self.performance_calculator = None
+    self.epochs = 1
+    # The optimizer is optional; when provided via `set_optimizer`, its
+    # iteration counter is used to exclude warm-up steps from the final average.
+    self.optimizer = None
+
+    self._train_step, self._test_step, self._predict_step = None, None, None
+    self._call_batch_hooks = True
+
+    self._called_in_fit = False
+
+  def set_params(self, params):
+    self.epochs = params["epochs"]
+    self._call_batch_hooks = True
+    try:
+      self._train_step = self.model._train_counter
+      self._test_step = self.model._test_counter
+      self._predict_step = self.model._predict_counter
+    except AttributeError:
+      self._call_batch_hooks = True
+
+    self.last_step = 0
+    if isinstance(self.last_step, (tf.Tensor, tf.Variable)):
+      self.last_step = self.last_step.numpy()
+
+  def set_optimizer(self, optimizer):
+    self.optimizer = optimizer
+
+  def on_train_begin(self, logs=None):
+    # When this logger is called inside `fit`, validation is silent.
+    self._called_in_fit = True
+    self._perf_wo = 0
+    self._perf_wo_n = 0
+
+    # Record the step count at the start of training so the first (warm-up)
+    # steps can be excluded from the throughput average.
+    if self.optimizer is not None and hasattr(self.optimizer, "iterations"):
+      self._first_steps = self.optimizer.iterations.numpy()
+    else:
+      self._first_steps = 0
+
+  def on_test_begin(self, logs=None):
+    if not self._called_in_fit:
+      self._reset_progbar()
+      self._maybe_init_progbar()
+
+  def on_predict_begin(self, logs=None):
+    self._reset_progbar()
+    self._maybe_init_progbar()
+
+  def on_train_batch_end(self, batch, logs=None):
+    if is_main_process():
+      self._batch_update_progbar(batch, logs)
+
+  def on_test_batch_end(self, batch, logs=None):
+    if not self._called_in_fit:
+      self._batch_update_progbar(batch, logs)
+
+  def on_predict_batch_end(self, batch, logs=None):
+    # Don't pass prediction results.
+    self._batch_update_progbar(batch, None)
+
+  def on_test_end(self, logs=None):
+    if not self._called_in_fit:
+      self._finalize_progbar(logs, self._test_step)
+
+  def on_predict_end(self, logs=None):
+    self._finalize_progbar(logs, self._predict_step)
+
+  def _reset_progbar(self):
+    self.seen = 0
+    self.performance_calculator = None
+
+  def _maybe_init_progbar(self):
+    if self.performance_calculator is None:
+      self.performance_calculator = PerformanceCalculator()
+
+  def _implements_train_batch_hooks(self):
+    return self._call_batch_hooks
+
+  def _implements_test_batch_hooks(self):
+    return self._call_batch_hooks
+
+  def _implements_predict_batch_hooks(self):
+    return self._call_batch_hooks
+
+  def _batch_update_progbar(self, batch, logs=None):
+    """Updates the performance_calculator."""
+    self._maybe_init_progbar()
+    self.seen = batch + 1  # One-indexed.
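+    # `self.seen` is the 1-indexed batch count, so the difference below is the
+    # number of steps completed since the previous report; PerformanceCalculator
+    # converts that into a samples/sec figure using the global batch size.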
+ delta_steps = self.seen - self.last_step + + step_throughput = self.performance_calculator(delta_steps, self.global_batch_size) + logger.info('Perf %.2f samples/s' % step_throughput) + + if batch > self._first_steps + delta_steps * 2: + self._perf_wo += step_throughput + self._perf_wo_n += 1 + + self.last_step = self.seen + + def _finalize_progbar(self, logs, counter): + results_perf = self.performance_calculator.get_current_benchmark_results() + logger.info(results_perf) + if self._perf_wo_n != 0: + logger.info("Throughput Average (examples/sec) = %0.2f", self._perf_wo / self._perf_wo_n) diff --git a/deepray/copts.bzl b/deepray/copts.bzl index e56213f0..42dd8ccc 100644 --- a/deepray/copts.bzl +++ b/deepray/copts.bzl @@ -71,3 +71,11 @@ TEST_CPP_COPTS = DEFAULT_CPP_COPTS + [ TEST_LINKOPTS = DEFAULT_LINKOPTS + [ "-fsanitize=address", ] + +# cc_* rules should include this list in copts. If additional cc_*-wide +# customization appears, we might want to switch to macros. + +"""This is the definition site for things we want to keep consistent, like copts.""" + +FCP_COPTS = [ +] diff --git a/deepray/core/base_trainer.py b/deepray/core/base_trainer.py deleted file mode 100644 index 68ee81c0..00000000 --- a/deepray/core/base_trainer.py +++ /dev/null @@ -1,991 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""A light weight utilities to train TensorFlow models.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import json -import os -import sys -import time -from typing import Union, List, Dict, Text - -import tensorflow as tf -from absl import logging, flags -from dllogger import Verbosity -from keras.engine import compile_utils -from keras.engine import data_adapter -from packaging import version - -from .compile_utils import HvdMetricsContainer - -if version.parse(tf.keras.__version__.replace("-tf", "+tf")) < version.parse("2.11"): - from tensorflow.keras import optimizers -else: - from tensorflow.keras.optimizers import legacy as optimizers -from deepray.callbacks import HvdCallbackList -from deepray.core.common import distribution_utils -from deepray.optimizers.optimization import GradientAccumulator -from deepray.utils import dllogger_class -from deepray.utils import gpu_affinity -from deepray.utils.flags import common_flags -from deepray.utils.misc import keras_utils -from deepray.utils.benchmark import PerformanceCalculator -from deepray.utils.horovod_utils import is_main_process, get_world_size -from deepray.utils import export - -from .module import Module - -_SUMMARY_TXT = 'training_summary.txt' -_MIN_SUMMARY_STEPS = 10 -FLAGS = flags.FLAGS - -if FLAGS.use_dynamic_embedding: - from tensorflow_recommenders_addons import dynamic_embedding as de - from tensorflow_recommenders_addons.dynamic_embedding.python.ops.dynamic_embedding_ops import TrainableWrapper, DEResourceVariable - tf.train.Checkpoint = de.train.checkpoint.DEHvdCheckpoint -else: - TrainableWrapper, DEResourceVariable = type(None), type(None) - -# Users should always run this script under TF 2.x -# The container haven't changed version number yet, skip the check. -assert tf.version.VERSION.startswith('2.') - -gpus = tf.config.experimental.list_physical_devices('GPU') -for gpu in gpus: - tf.config.experimental.set_memory_growth(gpu, True) - -if FLAGS.use_horovod: - if FLAGS.keras_use_ctl: - import horovod.tensorflow as hvd - else: - import horovod.tensorflow.keras as hvd - from horovod.tensorflow.compression import Compression - - hvd.init() - if gpus: - tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') - gpu_affinity.set_affinity(hvd.local_rank()) - -# Enables XLA in Session Config. Should not be set for TPU. -keras_utils.set_config_v2(FLAGS.enable_xla) - -use_float16 = common_flags.use_float16() -if use_float16: - policy = tf.keras.mixed_precision.Policy("mixed_float16") - tf.keras.mixed_precision.set_global_policy(policy) - logging.info("mixed_float16 enabled!") - - -def write_txt_summary(training_summary, summary_dir): - """Writes a summary text file to record stats.""" - summary_path = os.path.join(summary_dir, _SUMMARY_TXT) - with tf.io.gfile.GFile(summary_path, 'wb') as f: - logging.info('Training Summary: \n%s', str(training_summary)) - f.write(json.dumps(training_summary, indent=4, default=str)) - - -class Trainer(Module): - """Configures the model for training. - - Example: - - ```python - model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), - loss=tf.keras.losses.BinaryCrossentropy(), - metrics=[tf.keras.metrics.BinaryAccuracy(), - tf.keras.metrics.FalseNegatives()]) - ``` - - Args: - optimizer: String (name of optimizer) or optimizer instance. See - `tf.keras.optimizers`. - loss: Loss function. 
May be a string (name of loss function), or - a `tf.keras.losses.Loss` instance. See `tf.keras.losses`. A loss - function is any callable with the signature `loss = fn(y_true, - y_pred)`, where `y_true` are the ground truth values, and - `y_pred` are the model's predictions. - `y_true` should have shape - `(batch_size, d0, .. dN)` (except in the case of - sparse loss functions such as - sparse categorical crossentropy which expects integer arrays of - shape `(batch_size, d0, .. dN-1)`). - `y_pred` should have shape `(batch_size, d0, .. dN)`. - The loss function should return a float tensor. - If a custom `Loss` instance is - used and reduction is set to `None`, return value has shape - `(batch_size, d0, .. dN-1)` i.e. per-sample or per-timestep loss - values; otherwise, it is a scalar. If the model has multiple - outputs, you can use a different loss on each output by passing a - dictionary or a list of losses. The loss value that will be - minimized by the model will then be the sum of all individual - losses, unless `loss_weights` is specified. - metrics: List of metrics to be evaluated by the model during - training and testing. Each of this can be a string (name of a - built-in function), function or a `tf.keras.metrics.Metric` - instance. See `tf.keras.metrics`. Typically you will use - `metrics=['accuracy']`. - A function is any callable with the signature `result = fn(y_true, - y_pred)`. To specify different metrics for different outputs of a - multi-output model, you could also pass a dictionary, such as - `metrics={'output_a':'accuracy', 'output_b':['accuracy', 'mse']}`. - You can also pass a list to specify a metric or a list of metrics - for each output, such as - `metrics=[['accuracy'], ['accuracy', 'mse']]` - or `metrics=['accuracy', ['accuracy', 'mse']]`. When you pass the - strings 'accuracy' or 'acc', we convert this to one of - `tf.keras.metrics.BinaryAccuracy`, - `tf.keras.metrics.CategoricalAccuracy`, - `tf.keras.metrics.SparseCategoricalAccuracy` based on the shapes - of the targets and of the model output. We do a similar - conversion for the strings 'crossentropy' and 'ce' as well. - The metrics passed here are evaluated without sample weighting; if - you would like sample weighting to apply, you can specify your - metrics via the `weighted_metrics` argument instead. - loss_weights: Optional list or dictionary specifying scalar - coefficients (Python floats) to weight the loss contributions of - different model outputs. The loss value that will be minimized by - the model will then be the *weighted sum* of all individual - losses, weighted by the `loss_weights` coefficients. If a list, - it is expected to have a 1:1 mapping to the model's outputs. If a - dict, it is expected to map output names (strings) to scalar - coefficients. - weighted_metrics: List of metrics to be evaluated and weighted by - `sample_weight` or `class_weight` during training and testing. - run_eagerly: Bool. Defaults to `False`. If `True`, this `Model`'s - logic will not be wrapped in a `tf.function`. Recommended to leave - this as `None` unless your `Model` cannot be run inside a - `tf.function`. `run_eagerly=True` is not supported when using - `tf.distribute.experimental.ParameterServerStrategy`. - steps_per_execution: Int. Defaults to 1. The number of batches to - run during each `tf.function` call. Running multiple batches - inside a single `tf.function` call can greatly improve performance - on TPUs or small models with a large Python overhead. 
At most, one - full epoch will be run each execution. If a number larger than the - size of the epoch is passed, the execution will be truncated to - the size of the epoch. Note that if `steps_per_execution` is set - to `N`, `Callback.on_batch_begin` and `Callback.on_batch_end` - methods will only be called every `N` batches (i.e. before/after - each `tf.function` execution). - jit_compile: If `True`, compile the model training step with XLA. - [XLA](https://www.tensorflow.org/xla) is an optimizing compiler - for machine learning. - `jit_compile` is not enabled for by default. - Note that `jit_compile=True` - may not necessarily work for all models. - For more information on supported operations please refer to the - [XLA documentation](https://www.tensorflow.org/xla). - Also refer to - [known XLA issues](https://www.tensorflow.org/xla/known_issues) - for more details. - **kwargs: Arguments supported for backwards compatibility only. - """ - - def __init__( - self, - model: Union[tf.keras.Model, List[tf.keras.Model], Dict[Text, tf.keras.Model]], - optimizer="rmsprop", - loss=None, - metrics=None, - loss_weights=None, - weighted_metrics=None, - use_horovod=None, - run_eagerly=None, - jit_compile=None, - **kwargs - ): - super().__init__(**kwargs) - self._model = {} - if isinstance(model, list): - if len(model) > 0: - self._model = {"main": model[0]} - if len(model) == 2: - self._model["sub_model"] = model[1] - else: - for i in range(1, len(model)): - self._model[f"sub_model{i}"] = model[i] - else: - raise ValueError("Not a reachable model.") - elif isinstance(model, dict): - main_keys = [k for k in model.keys() if "main" in k] - if len(main_keys) == 1: - if (len(model) == 1): - self._model = {"main": next(iter(model.values()))} - else: - self._model = model - else: - raise ValueError(f"Must set only one model with key contains \"main\", found {main_keys}.") - elif isinstance(model, tf.keras.Model): - self._model = {"main": model} - else: - raise ValueError("Not a reachable model.") - - self._loss = loss - self._metrics = metrics - self._loss_weights = loss_weights - self._weighted_metrics = weighted_metrics - - self.use_horovod = use_horovod if use_horovod else FLAGS.use_horovod - self.run_eagerly = run_eagerly if run_eagerly else FLAGS.run_eagerly - self._jit_compile = jit_compile - - self.epochs = FLAGS.epochs - - if is_main_process(): - logging.info(" {} Initialize training".format(time.strftime("%Y%m%d %H:%M:%S"))) - - logging.info("\ttf.app.flags.FLAGS:") - for key, value in sorted(FLAGS.flag_values_dict().items()): - logging.info(f"\t{key:25}= {value}") - - self.global_batch_size = FLAGS.batch_size * FLAGS.num_accumulation_steps - learning_rate = FLAGS.learning_rate - - if self.use_horovod: - self.global_batch_size *= get_world_size() - learning_rate *= get_world_size() - - # TODO: fuhailin - # if isinstance(optimizer, optimizers.Optimizer): - self.optimizer = optimizer - # else: - # raise ValueError("Not support opt.") - self.use_float16 = common_flags.use_float16() - if self.use_float16: - self.optimizer = tf.keras.mixed_precision.LossScaleOptimizer(self.optimizer, dynamic=True) - - with distribution_utils.get_strategy_scope(self._distribution_strategy): - # To correctly place the model weights on accelerators, - # model should be created in scope. 
- if isinstance(self._loss, compile_utils.LossesContainer): - self.loss_container = self._loss - else: - self.loss_container = compile_utils.LossesContainer( - self._loss, self._loss_weights, output_names=self.main_model.output_names - ) - self.metric_container = HvdMetricsContainer( - self._metrics, - self._weighted_metrics, - output_names=self.main_model.output_names, - # from_serialized=from_serialized, - ) if self._metrics or self._weighted_metrics else None - - @property - def main_model(self): - """ - Returns: - The main model - """ - if len(self._model) == 1: - return self._model["main"] - else: - for name, _model in self._model.items(): - if "main" in name: - return _model - ValueError("Could not find the main model.") - - @property - def models(self): - if len(self._model) == 1: - return self._model["main"] - else: - return self._model - - @property - def checkpoint(self): - if len(self._checkpoints) == 1: - return self._checkpoints["main"] - else: - return self._checkpoints - - @property - def manager(self): - if len(self._managers) == 1: - return self._managers["main"] - else: - return self._managers - - def fit( - self, - train_input=None, - eval_input=None, - eval_steps=None, - verbose="auto", - callbacks=[], - steps_per_epoch: int = None, - ): - """Trains the model for a fixed number of epochs (dataset iterations). - - Args: - x: Input data. It could be: - - A Numpy array (or array-like), or a list of arrays - (in case the model has multiple inputs). - - A TensorFlow tensor, or a list of tensors - (in case the model has multiple inputs). - - A dict mapping input names to the corresponding array/tensors, - if the model has named inputs. - - A `tf.data` dataset. Should return a tuple - of either `(inputs, targets)` or - `(inputs, targets, sample_weights)`. - - A generator or `keras.utils.Sequence` returning `(inputs, - targets)` or `(inputs, targets, sample_weights)`. - - A `tf.keras.utils.experimental.DatasetCreator`, which wraps a - callable that takes a single argument of type - `tf.distribute.InputContext`, and returns a `tf.data.Dataset`. - `DatasetCreator` should be used when users prefer to specify the - per-replica batching and sharding logic for the `Dataset`. - See `tf.keras.utils.experimental.DatasetCreator` doc for more - information. - A more detailed description of unpacking behavior for iterator - types (Dataset, generator, Sequence) is given below. If these - include `sample_weights` as a third component, note that sample - weighting applies to the `weighted_metrics` argument but not the - `metrics` argument in `compile()`. If using - `tf.distribute.experimental.ParameterServerStrategy`, only - `DatasetCreator` type is supported for `x`. - y: Target data. Like the input data `x`, - it could be either Numpy array(s) or TensorFlow tensor(s). - It should be consistent with `x` (you cannot have Numpy inputs and - tensor targets, or inversely). If `x` is a dataset, generator, - or `keras.utils.Sequence` instance, `y` should - not be specified (since targets will be obtained from `x`). - batch_size: Integer or `None`. - Number of samples per gradient update. - If unspecified, `batch_size` will default to 32. - Do not specify the `batch_size` if your data is in the - form of datasets, generators, or `keras.utils.Sequence` - instances (since they generate batches). - epochs: Integer. Number of epochs to train the model. - An epoch is an iteration over the entire `x` and `y` - data provided - (unless the `steps_per_epoch` flag is set to - something other than None). 
- Note that in conjunction with `initial_epoch`, - `epochs` is to be understood as "final epoch". - The model is not trained for a number of iterations - given by `epochs`, but merely until the epoch - of index `epochs` is reached. - verbose: 'auto', 0, 1, or 2. Verbosity mode. - 0 = silent, 1 = progress bar, 2 = one line per epoch. - 'auto' defaults to 1 for most cases, but 2 when used with - `ParameterServerStrategy`. Note that the progress bar is not - particularly useful when logged to a file, so verbose=2 is - recommended when not running interactively (eg, in a production - environment). - callbacks: List of `keras.callbacks.Callback` instances. - List of callbacks to apply during training. - See `tf.keras.callbacks`. Note - `tf.keras.callbacks.ProgbarLogger` and - `tf.keras.callbacks.History` callbacks are created automatically - and need not be passed into `model.fit`. - `tf.keras.callbacks.ProgbarLogger` is created or not based on - `verbose` argument to `model.fit`. - Callbacks with batch-level calls are currently unsupported with - `tf.distribute.experimental.ParameterServerStrategy`, and users - are advised to implement epoch-level calls instead with an - appropriate `steps_per_epoch` value. - validation_split: Float between 0 and 1. - Fraction of the training data to be used as validation data. - The model will set apart this fraction of the training data, - will not train on it, and will evaluate - the loss and any model metrics - on this data at the end of each epoch. - The validation data is selected from the last samples - in the `x` and `y` data provided, before shuffling. This - argument is not supported when `x` is a dataset, generator or - `keras.utils.Sequence` instance. - If both `validation_data` and `validation_split` are provided, - `validation_data` will override `validation_split`. - `validation_split` is not yet supported with - `tf.distribute.experimental.ParameterServerStrategy`. - validation_data: Data on which to evaluate - the loss and any model metrics at the end of each epoch. - The model will not be trained on this data. Thus, note the fact - that the validation loss of data provided using - `validation_split` or `validation_data` is not affected by - regularization layers like noise and dropout. - `validation_data` will override `validation_split`. - `validation_data` could be: - - A tuple `(x_val, y_val)` of Numpy arrays or tensors. - - A tuple `(x_val, y_val, val_sample_weights)` of NumPy - arrays. - - A `tf.data.Dataset`. - - A Python generator or `keras.utils.Sequence` returning - `(inputs, targets)` or `(inputs, targets, sample_weights)`. - `validation_data` is not yet supported with - `tf.distribute.experimental.ParameterServerStrategy`. - shuffle: Boolean (whether to shuffle the training data - before each epoch) or str (for 'batch'). This argument is - ignored when `x` is a generator or an object of tf.data.Dataset. - 'batch' is a special option for dealing - with the limitations of HDF5 data; it shuffles in batch-sized - chunks. Has no effect when `steps_per_epoch` is not `None`. - class_weight: Optional dictionary mapping class indices (integers) - to a weight (float) value, used for weighting the loss function - (during training only). - This can be useful to tell the model to - "pay more attention" to samples from - an under-represented class. - sample_weight: Optional Numpy array of weights for - the training samples, used for weighting the loss function - (during training only). 
You can either pass a flat (1D) - Numpy array with the same length as the input samples - (1:1 mapping between weights and samples), - or in the case of temporal data, - you can pass a 2D array with shape - `(samples, sequence_length)`, - to apply a different weight to every timestep of every sample. - This argument is not supported when `x` is a dataset, generator, - or `keras.utils.Sequence` instance, instead provide the - sample_weights as the third element of `x`. - Note that sample weighting does not apply to metrics specified - via the `metrics` argument in `compile()`. To apply sample - weighting to your metrics, you can specify them via the - `weighted_metrics` in `compile()` instead. - initial_epoch: Integer. - Epoch at which to start training - (useful for resuming a previous training run). - steps_per_epoch: Integer or `None`. - Total number of steps (batches of samples) - before declaring one epoch finished and starting the - next epoch. When training with input tensors such as - TensorFlow data tensors, the default `None` is equal to - the number of samples in your dataset divided by - the batch size, or 1 if that cannot be determined. If x is a - `tf.data` dataset, and 'steps_per_epoch' - is None, the epoch will run until the input dataset is - exhausted. When passing an infinitely repeating dataset, you - must specify the `steps_per_epoch` argument. If - `steps_per_epoch=-1` the training will run indefinitely with an - infinitely repeating dataset. This argument is not supported - with array inputs. - When using `tf.distribute.experimental.ParameterServerStrategy`: - * `steps_per_epoch=None` is not supported. - eval_steps: Only relevant if `validation_data` is provided and - is a `tf.data` dataset. Total number of steps (batches of - samples) to draw before stopping when performing validation - at the end of every epoch. If 'eval_steps' is None, - validation will run until the `validation_data` dataset is - exhausted. In the case of an infinitely repeated dataset, it - will run into an infinite loop. If 'eval_steps' is - specified and only part of the dataset will be consumed, the - evaluation will start from the beginning of the dataset at each - epoch. This ensures that the same validation samples are used - every time. - validation_batch_size: Integer or `None`. - Number of samples per validation batch. - If unspecified, will default to `batch_size`. - Do not specify the `validation_batch_size` if your data is in - the form of datasets, generators, or `keras.utils.Sequence` - instances (since they generate batches). - validation_freq: Only relevant if validation data is provided. - Integer or `collections.abc.Container` instance (e.g. list, tuple, - etc.). If an integer, specifies how many training epochs to run - before a new validation run is performed, e.g. `validation_freq=2` - runs validation every 2 epochs. If a Container, specifies the - epochs on which to run validation, e.g. - `validation_freq=[1, 2, 10]` runs validation at the end of the - 1st, 2nd, and 10th epochs. - max_queue_size: Integer. Used for generator or - `keras.utils.Sequence` input only. Maximum size for the generator - queue. If unspecified, `max_queue_size` will default to 10. - workers: Integer. Used for generator or `keras.utils.Sequence` input - only. Maximum number of processes to spin up - when using process-based threading. If unspecified, `workers` - will default to 1. - use_multiprocessing: Boolean. Used for generator or - `keras.utils.Sequence` input only. 
If `True`, use process-based - threading. If unspecified, `use_multiprocessing` will default to - `False`. Note that because this implementation relies on - multiprocessing, you should not pass non-picklable arguments to - the generator as they can't be passed easily to children - processes. - - Unpacking behavior for iterator-like inputs: - A common pattern is to pass a tf.data.Dataset, generator, or - tf.keras.utils.Sequence to the `x` argument of fit, which will in fact - yield not only features (x) but optionally targets (y) and sample - weights. Keras requires that the output of such iterator-likes be - unambiguous. The iterator should return a tuple of length 1, 2, or 3, - where the optional second and third elements will be used for y and - sample_weight respectively. Any other type provided will be wrapped in - a length one tuple, effectively treating everything as 'x'. When - yielding dicts, they should still adhere to the top-level tuple - structure. - e.g. `({"x0": x0, "x1": x1}, y)`. Keras will not attempt to separate - features, targets, and weights from the keys of a single dict. - A notable unsupported data type is the namedtuple. The reason is - that it behaves like both an ordered datatype (tuple) and a mapping - datatype (dict). So given a namedtuple of the form: - `namedtuple("example_tuple", ["y", "x"])` - it is ambiguous whether to reverse the order of the elements when - interpreting the value. Even worse is a tuple of the form: - `namedtuple("other_tuple", ["x", "y", "z"])` - where it is unclear if the tuple was intended to be unpacked into x, - y, and sample_weight or passed through as a single element to `x`. As - a result the data processing code will simply raise a ValueError if it - encounters a namedtuple. (Along with instructions to remedy the - issue.) - - Returns: - A `History` object. Its `History.history` attribute is - a record of training loss values and metrics values - at successive epochs, as well as validation loss values - and validation metrics values (if applicable). - - Raises: - RuntimeError: 1. If the model was never compiled or, - 2. If `model.fit` is wrapped in `tf.function`. - - ValueError: In case of mismatch between the provided input data - and what the model expects or when the input data is empty. - """ - self.steps_per_epoch = steps_per_epoch if steps_per_epoch else -1 - self.eval_steps = eval_steps - if FLAGS.benchmark or FLAGS.stop_steps >= 0: - if FLAGS.stop_steps >= 0: - self.steps_per_epoch = FLAGS.stop_steps - else: - self.steps_per_epoch = 1000 - self.epochs = 1 - - if FLAGS.keras_use_ctl: - self._performance_calculator = PerformanceCalculator(total_steps=self.steps_per_epoch * self.epochs) - - self.steps_per_loop = FLAGS.steps_per_summary - if 1 < self.steps_per_epoch < self.steps_per_loop: - if is_main_process(): - logging.error( - 'steps_per_summary: %d is specified to be greater than ' - ' steps_per_epoch: %d, we will use steps_per_epoch as' - ' steps_per_summary.', self.steps_per_loop, self.steps_per_epoch - ) - self.steps_per_loop = self.steps_per_epoch - - self._configure_steps_per_execution(self.steps_per_loop or 1) - assert tf.executing_eagerly() - - if self.run_eagerly: - # if self.steps_per_loop > 1: - # raise ValueError( - # 'steps_per_loop is used for performance optimization. 
When you want ' - # 'to run eagerly, you cannot leverage graph mode loop.') - if isinstance(self._distribution_strategy, tf.distribute.experimental.TPUStrategy): - raise ValueError( - 'TPUStrategy should not run eagerly as it heavily replies on graph' - ' optimization for the distributed system.' - ) - - self.make_train_function() - - # Create summary writers - if is_main_process(): - self.summary_dir = os.path.join(FLAGS.model_dir, 'summaries') - self.eval_summary_writer = tf.summary.create_file_writer(os.path.join(self.summary_dir, 'eval')) - if self.steps_per_loop >= _MIN_SUMMARY_STEPS: - # Only writes summary when the stats are collected sufficiently over - # enough steps. - self.train_summary_writer = tf.summary.create_file_writer(os.path.join(self.summary_dir, 'train')) - else: - self.train_summary_writer = None - else: - self.eval_summary_writer = None - self.train_summary_writer = None - eval_input_fn = None - - self._checkpoints, self._managers = {}, {} - for name, model in self._model.items(): - if "main" in name: - _checkpoint = tf.train.Checkpoint(model=model, optimizer=self.optimizer) - self._checkpoints[name] = _checkpoint - self._managers[name] = tf.train.CheckpointManager( - _checkpoint, os.path.join(FLAGS.model_dir, f'ckpt_{name}'), max_to_keep=3 - ) - else: - _checkpoint = tf.train.Checkpoint(model=model) - self._checkpoints[name] = _checkpoint - self._managers[name] = tf.train.CheckpointManager( - _checkpoint, os.path.join(FLAGS.model_dir, f'ckpt_{name}'), max_to_keep=3 - ) - - if FLAGS.init_checkpoint: - for (name, ckpt), init_ckpt in zip(self._checkpoints.items(), FLAGS.init_checkpoint): - if init_ckpt: - if tf.io.gfile.isdir(init_ckpt): - latest_checkpoint = tf.train.latest_checkpoint(init_ckpt) - else: - latest_checkpoint = init_ckpt - logging.info( - f'Checkpoint file {latest_checkpoint} found and restoring from initial checkpoint for {name} model.' - ) - ckpt.restore(latest_checkpoint).assert_existing_objects_matched() - logging.info('Loading from checkpoint file completed') - - if FLAGS.init_weights: - for (name, _model), init_weight in zip(self._model.items(), FLAGS.init_weights): - if init_weight: - logging.info(f'variables file {init_weight} found and restoring from initial variables for {name} model.') - _model.load_weights(os.path.join(init_weight, "variables")) - logging.info('Loading from weights file completed') - - if FLAGS.num_accumulation_steps > 1: - self.accum_gradients = GradientAccumulator() - - verbose = 0 # training_module._get_verbosity(verbose, self._distribution_strategy) - - # Container that configures and calls `tf.keras.Callback`s. - if not isinstance(callbacks, HvdCallbackList): - self.callbacks = HvdCallbackList( - callbacks, - add_history=True, - add_progbar=verbose != 0, - model=self.main_model, - verbose=verbose, - epochs=self.epochs, - steps=self.steps_per_epoch * self.epochs, - ) - return self.run_customized_training_loop(train_input, eval_input) - else: - if FLAGS.use_horovod and not FLAGS.use_dynamic_embedding: - # Add Horovod Distributed Optimizer - opt = hvd.DistributedOptimizer(self.optimizer) - else: - opt = self.optimizer - - self.main_model.compile( - optimizer=opt, - loss=self._loss, - loss_weights=self._loss_weights, - metrics=self._metrics, - weighted_metrics=self._weighted_metrics, - run_eagerly=self.run_eagerly - ) - - # if not FLAGS.benchmark: - # # Create Tensorboard summary and checkpoint callbacks. 
- # summary_dir = os.path.join(FLAGS.model_dir, "summaries") - # callbacks.append(tf.keras.callbacks.TensorBoard(summary_dir, profile_batch=0)) - - # # Horovod: save checkpoints only on worker 0 to prevent other workers from corrupting them. - # if is_main_process(): - # checkpoint_path = os.path.join(FLAGS.model_dir, "checkpoint") - # callbacks.append(tf.keras.callbacks.ModelCheckpoint(checkpoint_path, save_weights_only=True)) - - if FLAGS.use_horovod: - callbacks += [ - # Horovod: broadcast initial variable states from rank 0 to all other processes. - # This is necessary to ensure consistent initialization of all workers when - # training is started with random weights or restored from a checkpoint. - # hvd callback用于广播rank0的初始化器产生的值 - de.keras.callbacks.DEHvdBroadcastGlobalVariablesCallback(root_rank=0) - if FLAGS.use_dynamic_embedding else hvd.callbacks.BroadcastGlobalVariablesCallback(0), - ] - - # Horovod: write logs on worker 0. - verbose = 2 if is_main_process() else 0 - history = self.main_model.fit( - train_input, - epochs=self.epochs, - steps_per_epoch=self.steps_per_epoch if self.steps_per_epoch else None, - callbacks=callbacks, - validation_data=eval_input, - validation_steps=eval_steps, - verbose=verbose - ) - return history - - def run_customized_training_loop( - self, - train_input=None, - eval_input=None, - ): - # if self.epochs > 1 and FLAGS.num_train_examples == -1: - # raise ValueError('When the num_train_examples is INFINITE or UNKNOWN, we just can run one epoch.') - - # Training loop starts here. - self.current_step = self._first_steps = self.optimizer.iterations.numpy() - - if self.use_horovod: - with tf.init_scope(): - self.first_batch = tf.Variable(True, trainable=False, dtype=tf.bool, name='first_batch') - if not hasattr(self.main_model, 'optimizer'): - raise ValueError('User should set optimizer attribute to model ' - 'inside `model_fn`.') - # if self.sub_model_export_name and self.sub_model is None: - # raise ValueError('sub_model_export_name is specified as %s, but ' - # 'sub_model is None.' % self.sub_model_export_name) - - self._steps_from_save = 0 - start_time = time.time() - self._perf_wo = 0 - self._perf_wo_n = 0 - - self.callbacks.on_train_begin() - training_logs = None - for epoch in range(self.epochs): - train_iterator = distribution_utils.make_distributed_iterator(self._distribution_strategy, train_input) - self.on_epoch_begin(epoch) - while self.steps_per_epoch < 0 or self._step_epoch < self.steps_per_epoch: - t0 = time.time() - self.callbacks.on_train_batch_begin(self.current_step) - # Runs several steps in the host while loop. - steps, num_accumulation_steps = self.steps_to_run(self.current_step, self.steps_per_epoch, self.steps_per_loop) - - try: - if steps == 1: - training_logs = self._train_step(next(train_iterator), num_accumulation_steps) - else: - # Converts steps to a Tensor to avoid tf.function retracing. 
- training_logs = self._train_steps( - train_iterator, tf.convert_to_tensor(steps, dtype=tf.int32), num_accumulation_steps - ) - except (tf.errors.OutOfRangeError, StopIteration): - if is_main_process(): - logging.info(f"Done reading data for epoch {epoch}") - if self.optimizer.iterations.numpy() == self._first_steps: - logging.warning("No data was processed.") - return None - elif steps > 1 and self.optimizer.iterations.numpy() > self.current_step: - steps = self.optimizer.iterations.numpy() - self.current_step - training_logs = self.get_metrics_result() - self.on_batch_end(training_logs, steps, t0) - break - - self.on_batch_end(training_logs, steps, t0) - self.on_epoch_end(epoch, self.current_step, eval_input, epoch_logs=training_logs) - if self.main_model.stop_training: - logging.info(f"self.model.stop_training = {self.main_model.stop_training}") - break - self.callbacks.on_train_end(logs=training_logs) - - total_time = time.time() - start_time - results_perf = self._performance_calculator.results - if not self._performance_calculator.completed: - logging.info(f"self._performance_calculator.completed: {self._performance_calculator.completed}") - results_perf = self._performance_calculator.get_current_benchmark_results() - - export.export_to_checkpoint(self.manager, self.current_step) - if is_main_process(): - training_summary = {'total_training_steps': self.current_step} - if self.loss_container: - training_summary['train_loss'] = self._float_metric_value(self.loss_container.metrics[0]) - - if self.metric_container and self.metric_container.metrics: - # TODO(hongkuny): Cleans up summary reporting in text. - for metric in self.metric_container.metrics: - training_summary['last_' + metric.name] = self._float_metric_value(metric) - # training_summary['eval_metrics'] = _float_metric_value(self.metric_container.metrics[0]) - - write_txt_summary(training_summary, self.summary_dir) - - dllogging = dllogger_class.dllogger_class(FLAGS.dllog_path) - total_sentences = self.current_step * self.global_batch_size - logging.info("-----------------------------") - logging.info(" Batch size = %d", FLAGS.batch_size) - logging.info(" Num steps = %d", self.current_step) - logging.info(" LR = %g", FLAGS.learning_rate) - if self.use_horovod: - logging.info("Multi-GPU training with TF Horovod") - logging.info("hvd.size() = %d", get_world_size()) - logging.info("Total Training Time = %0.2f for Examples = %d", total_time, total_sentences) - logging.info("Throughput Average (examples/sec) with overhead = %0.2f", results_perf['throughput']) - if self._perf_wo_n != 0: - logging.info("Throughput Average (examples/sec) = %0.2f", self._perf_wo / self._perf_wo_n) - logging.info("-----------------------------") - - if dllogging and self._perf_wo_n != 0: - dllogging.logger.log( - step=(), data={"throughput_train": self._perf_wo / self._perf_wo_n}, verbosity=Verbosity.DEFAULT - ) - dllogging.logger.log(step=(), data={"total_loss": training_summary['train_loss']}, verbosity=Verbosity.DEFAULT) - dllogging.logger.log(data=results_perf, step=tuple()) - - return self.main_model - - def train_single_step(self, iterator, num_grad_accumulates): - """Performs a distributed training step. - - Args: - iterator: the distributed iterator of training datasets. - - Raises: - ValueError: Any of the arguments or tensor shapes are invalid. 
- """ - if num_grad_accumulates != 1: - for _ in tf.range(num_grad_accumulates): - self.forward(iterator) - if _ == 0 or (_ + 1) % num_grad_accumulates == 0: - self.step(num_grad_accumulates) - if self.use_horovod and self.first_batch: - self.do_broadcast() - else: - self._replicated_step(iterator) - return self.get_metrics_result() - - @property - def trainable_variables(self): - if hasattr(self.loss_container, 'trainable_variables'): - return self.main_model.trainable_variables + self.loss_container.trainable_variables - else: - return self.main_model.trainable_variables - - def do_broadcast(self): - model_broadcast_vars = [ - var for var in self.main_model.variables - if (not isinstance(var, TrainableWrapper)) and (not isinstance(var, DEResourceVariable)) - ] - opt_broadcast_vars = [ - var for var in self.optimizer.variables() - if (not isinstance(var, TrainableWrapper)) and (not isinstance(var, DEResourceVariable)) - ] - - print_op = tf.print( - f"Broadcasting {len(model_broadcast_vars)} model variables & {len(opt_broadcast_vars)} optimizer variables...", - output_stream=sys.stdout - ) - with tf.control_dependencies([print_op]): - hvd.broadcast_variables(model_broadcast_vars + opt_broadcast_vars, root_rank=0) - self.first_batch.assign(False) - - def _replicated_step(self, inputs): - """Replicated training step.""" - x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(inputs) - with tf.GradientTape() as tape: - model_outputs = self.main_model(x, training=True) - loss = self.loss_container(y, model_outputs, sample_weight=sample_weight) - - if self.use_horovod and not FLAGS.use_dynamic_embedding: - tape = hvd.DistributedGradientTape( - tape, sparse_as_dense=False, compression=Compression.fp16 if self.use_float16 else Compression.none - ) - # Run backwards pass. - self.optimizer.minimize(loss, self.trainable_variables, tape=tape) - - if self.use_horovod and self.first_batch: - self.do_broadcast() - - # For reporting, the metric takes the mean of losses. - if self.metric_container: - self.metric_container.update_state(y_true=y, y_pred=model_outputs, sample_weight=sample_weight) - - def forward(self, inputs): - x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(inputs) - with tf.GradientTape() as tape: - model_outputs = self.main_model(x, training=True) - loss = self.loss_container(y, model_outputs, sample_weight=sample_weight) - - # Compute gradients - if version.parse(tf.keras.__version__.replace("-tf", "+tf")) < version.parse("2.11"): - grads_and_vars = self.optimizer._compute_gradients(loss=loss, var_list=self.trainable_variables, tape=tape) - else: - grads_and_vars = self.optimizer.compute_gradients(loss=loss, var_list=self.trainable_variables, tape=tape) - grads = [g for g, _ in grads_and_vars] - self.accum_gradients.add_gradients(grads) - - # For reporting, the metric takes the mean of losses. 
- if self.metric_container: - self.metric_container.update_state(y_true=y, y_pred=model_outputs, sample_weight=sample_weight) - - def step(self, num_grad_accumulates): - gradients = self.accum_gradients.gradients - if self.use_horovod: - gradients = [ - None if g is None else hvd.allreduce( - g / tf.cast(num_grad_accumulates, g.dtype), - compression=Compression.fp16 if self.use_float16 else Compression.none - ) for g in gradients - ] - else: - gradients = [None if g is None else g / tf.cast(num_grad_accumulates, g.dtype) for g in gradients] - - self.optimizer.apply_gradients(zip(gradients, self.trainable_variables)) - self.accum_gradients.reset() - - def train_steps_strategy(self, iterator, steps, num_grad_accumulates): - """Performs distributed training steps in a loop. - - Args: - iterator: the distributed iterator of training datasets. - steps: a tf.int32 integer tensor to specify number of steps to run - inside host training loop. - - Raises: - ValueError: Any of the arguments or tensor shapes are invalid. - """ - if not isinstance(steps, tf.Tensor): - raise ValueError('steps should be an Tensor. Python object may cause ' - 'retracing.') - - if num_grad_accumulates != 1: - for _ in tf.range(steps * num_grad_accumulates): - self._distribution_strategy.run(self.forward, args=(next(iterator),)) - if _ == 0 or (_ + 1) % num_grad_accumulates == 0: - self._distribution_strategy.run(self.step, args=(num_grad_accumulates,)) - else: - for _ in tf.range(steps): - self._distribution_strategy.run(self._replicated_step, args=(next(iterator),)) - return self.get_metrics_result() - - def train_steps(self, iterator, steps, num_grad_accumulates): - if not isinstance(steps, tf.Tensor): - raise ValueError('steps should be an Tensor. Python object may cause ' - 'retracing.') - - if num_grad_accumulates != 1: - for _ in tf.range(steps * num_grad_accumulates): - self.forward(next(iterator)) - if _ == 0 or (_ + 1) % num_grad_accumulates == 0: - self.step(num_grad_accumulates) - if self.use_horovod and self.first_batch: - self.do_broadcast() - else: - for _ in tf.range(steps): - self._replicated_step(next(iterator)) - return self.get_metrics_result() - - def train_single_step_strategy(self, iterator, num_grad_accumulates): - """Performs a distributed training step. - - Args: - iterator: the distributed iterator of training datasets. - - Raises: - ValueError: Any of the arguments or tensor shapes are invalid. 
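For orientation, here is a minimal standalone sketch of the accumulate-then-apply pattern that `forward`/`step` in the removed base_trainer.py implement: gradients are summed into accumulator variables over several micro-batches, averaged once, and only then applied. The toy model, data, and `accum` list below are illustrative stand-ins, not the trainer's actual `accum_gradients` object, and the Horovod allreduce is only indicated by a comment.

```python
# Sketch of gradient accumulation with plain TF 2.x (illustrative only).
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(3,))])
optimizer = tf.keras.optimizers.SGD(0.1)
num_accum = 4

# One non-trainable accumulator per trainable variable.
accum = [tf.Variable(tf.zeros_like(v), trainable=False) for v in model.trainable_variables]

x = tf.random.normal((8, 3))
y = tf.random.normal((8, 1))

for _ in range(num_accum):
  with tf.GradientTape() as tape:
    loss = tf.reduce_mean(tf.square(model(x, training=True) - y))
  grads = tape.gradient(loss, model.trainable_variables)
  for a, g in zip(accum, grads):
    a.assign_add(g)

# Average over the accumulation window (this is where the hvd.allreduce in
# `step` above would go when Horovod is enabled), then apply and reset.
avg = [a / num_accum for a in accum]
optimizer.apply_gradients(zip(avg, model.trainable_variables))
for a in accum:
  a.assign(tf.zeros_like(a))
```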
- """ - if num_grad_accumulates != 1: - for _ in tf.range(num_grad_accumulates): - self._distribution_strategy.run(self.forward, args=(iterator,)) - if _ == 0 or (_ + 1) % num_grad_accumulates == 0: - self._distribution_strategy.run(self.step, args=(num_grad_accumulates,)) - else: - self._distribution_strategy.run(self._replicated_step, args=(iterator,)) - return self.get_metrics_result() - - def make_train_function(self): - if not self.run_eagerly: - _train_single_step = tf.function(self.train_single_step) - _train_multi_steps = tf.function(self.train_steps) - else: - _train_single_step = self.train_single_step - _train_multi_steps = self.train_steps - - if self._distribution_strategy: - self._train_step = self.train_single_step_strategy - self._train_steps = self.train_steps_strategy - else: - self._train_step = _train_single_step - self._train_steps = _train_multi_steps diff --git a/deepray/core/base_trainer_test.py b/deepray/core/base_trainer_test.py deleted file mode 100644 index 29d59520..00000000 --- a/deepray/core/base_trainer_test.py +++ /dev/null @@ -1,351 +0,0 @@ -# Copyright 2022 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tests for tensorflow_models.core.trainers.trainer.""" -# pylint: disable=g-direct-tensorflow-import -import gc -import multiprocessing -import os -import sys - -from absl.testing import parameterized -import orbit -import portpicker -import tensorflow as tf - -from tensorflow.python.distribute import combinations -from tensorflow.python.distribute import strategy_combinations -from official.core import base_trainer as trainer_lib -from official.core import config_definitions as cfg -from official.core import train_lib -from official.utils.testing import mock_task - -TPU_TEST = 'test_tpu' in sys.argv[0] -GPU_TEST = 'test_gpu' in sys.argv[0] - - -def all_strategy_combinations(): - return combinations.combine( - distribution=[ - strategy_combinations.default_strategy, - strategy_combinations.cloud_tpu_strategy, - strategy_combinations.one_device_strategy_gpu, - ], - ) - - -def create_in_process_cluster(num_workers, num_ps): - """Creates and starts local servers and returns the cluster_resolver.""" - worker_ports = [portpicker.pick_unused_port() for _ in range(num_workers)] - ps_ports = [portpicker.pick_unused_port() for _ in range(num_ps)] - - cluster_dict = {} - cluster_dict['worker'] = ['localhost:%s' % port for port in worker_ports] - if num_ps > 0: - cluster_dict['ps'] = ['localhost:%s' % port for port in ps_ports] - - cluster_spec = tf.train.ClusterSpec(cluster_dict) - - # Workers need some inter_ops threads to work properly. 
- worker_config = tf.compat.v1.ConfigProto() - if multiprocessing.cpu_count() < num_workers + 1: - worker_config.inter_op_parallelism_threads = num_workers + 1 - - for i in range(num_workers): - tf.distribute.Server(cluster_spec, job_name='worker', task_index=i, config=worker_config, protocol='grpc') - - for i in range(num_ps): - tf.distribute.Server(cluster_spec, job_name='ps', task_index=i, protocol='grpc') - - cluster_resolver = tf.distribute.cluster_resolver.SimpleClusterResolver(cluster_spec, rpc_layer='grpc') - return cluster_resolver - - -def dataset_fn(input_context=None): - del input_context - - def dummy_data(_): - return tf.zeros((1, 1), dtype=tf.float32) - - dataset = tf.data.Dataset.range(1) - dataset = dataset.repeat() - dataset = dataset.map(dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE) - return dataset - - -class MockAsyncTrainer(trainer_lib._AsyncTrainer): - """Mock AsyncTrainer to test the _AsyncTrainer class.""" - - def __init__(self): - self._strategy = tf.distribute.get_strategy() - self.init_async() - - self.global_step = tf.Variable( - 0, dtype=tf.int64, name='global_step', trainable=False, aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA - ) - self.eval_global_step = tf.Variable( - 0, - dtype=tf.int64, - name='eval_global_step', - trainable=False, - aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA - ) - - train_dataset = self.distribute_dataset(dataset_fn) - orbit.StandardTrainer.__init__(self, train_dataset, options=orbit.StandardTrainerOptions()) - - validation_dataset = self.distribute_dataset(dataset_fn) - orbit.StandardEvaluator.__init__( - self, validation_dataset, options=orbit.StandardEvaluatorOptions(use_tf_while_loop=True) - ) - - def train_loop_begin(self): - self.global_step.assign(0) - - def train_step(self, iterator): - - def replica_step(_): - self.global_step.assign_add(1) - - self._strategy.run(replica_step, args=(next(iterator),)) - - def train_loop_end(self): - self.join() - return self.global_step.numpy() - - def eval_begin(self): - self.eval_global_step.assign(0) - - def eval_step(self, iterator): - - def replica_step(_): - self.eval_global_step.assign_add(1) - - self._strategy.run(replica_step, args=(next(iterator),)) - - def eval_end(self): - self.join() - return self.eval_global_step.numpy() - - -class TrainerTest(tf.test.TestCase, parameterized.TestCase): - - def setUp(self): - super().setUp() - self._config = cfg.ExperimentConfig( - trainer=cfg.TrainerConfig( - optimizer_config=cfg - .OptimizationConfig({ - 'optimizer': { - 'type': 'sgd' - }, - 'learning_rate': { - 'type': 'constant' - } - }) - ) - ) - - def tearDown(self): - gc.collect() - # This will only contain uncollectable garbage, i.e. reference cycles - # involving objects with __del__ defined. 
- self.assertEmpty(gc.garbage) - super().tearDown() - - def create_test_trainer(self, config, model_dir=None, task=None): - task = task or mock_task.MockTask(config.task, logging_dir=model_dir) - ckpt_exporter = train_lib.maybe_create_best_ckpt_exporter(config, model_dir) - trainer = trainer_lib.Trainer( - config, - task, - model=task.build_model(), - optimizer=task.create_optimizer(config.trainer.optimizer_config, config.runtime), - checkpoint_exporter=ckpt_exporter - ) - return trainer - - @combinations.generate(all_strategy_combinations()) - def test_trainer_train(self, distribution): - with distribution.scope(): - trainer = self.create_test_trainer(self._config) - logs = trainer.fit(tf.convert_to_tensor(5, dtype=tf.int32)) - self.assertIn('training_loss', logs) - self.assertIn('learning_rate', logs) - - @combinations.generate(all_strategy_combinations()) - def test_trainer_passing_datasets(self, distribution): - with distribution.scope(): - task = mock_task.MockTask(self._config) - train_dataset = orbit.utils.make_distributed_dataset( - distribution, task.build_inputs, self._config.task.train_data - ) - validation_dataset = orbit.utils.make_distributed_dataset( - distribution, task.build_inputs, self._config.task.validation_data - ) - self._config.task.train_data = None - self._config.task.validation_data = None - trainer = trainer_lib.Trainer( - self._config, - task, - model=task.build_model(), - optimizer=task.create_optimizer(self._config.trainer.optimizer_config, self._config.runtime), - train_dataset=train_dataset, - validation_dataset=validation_dataset - ) - logs = trainer.fit(tf.convert_to_tensor(5, dtype=tf.int32)) - self.assertIn('training_loss', logs) - self.assertIn('learning_rate', logs) - logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32)) - self.assertIn('validation_loss', logs) - - def test_base_async_trainer(self): - if TPU_TEST or GPU_TEST: - self.skipTest('Aysnc training is not available on GPU/GPU.') - num_workers = 3 - num_ps = 2 - cluster_resolver = create_in_process_cluster(num_workers, num_ps) - distribution = tf.distribute.experimental.ParameterServerStrategy(cluster_resolver) - with distribution.scope(): - trainer = MockAsyncTrainer() - trainer.init_async() - self.assertIsInstance(trainer._coordinator, tf.distribute.experimental.coordinator.ClusterCoordinator) - self.assertEqual(trainer.train(tf.constant(10)), 10) - self.assertEqual(trainer.evaluate(tf.constant(11)), 11) - - def test_async_trainer_train(self): - if TPU_TEST or GPU_TEST: - self.skipTest('Aysnc training is not available on GPU/TPU.') - num_workers = 3 - num_ps = 2 - cluster_resolver = create_in_process_cluster(num_workers, num_ps) - distribution = tf.distribute.experimental.ParameterServerStrategy(cluster_resolver) - with distribution.scope(): - config = cfg.ExperimentConfig(**self._config.as_dict()) - config.trainer.eval_tf_while_loop = True - trainer = self.create_test_trainer(config) - logs = trainer.fit(tf.convert_to_tensor(5, dtype=tf.int32)) - self.assertIn('training_loss', logs) - self.assertIn('learning_rate', logs) - - def test_async_trainer_validate(self): - if TPU_TEST or GPU_TEST: - self.skipTest('Aysnc training is not available on GPU/GPU.') - num_workers = 3 - num_ps = 2 - cluster_resolver = create_in_process_cluster(num_workers, num_ps) - distribution = tf.distribute.experimental.ParameterServerStrategy(cluster_resolver) - with distribution.scope(): - config = cfg.ExperimentConfig(**self._config.as_dict()) - config.trainer.eval_tf_while_loop = True - trainer = 
self.create_test_trainer(config) - logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32)) - self.assertIn('acc', logs) - self.assertIn('validation_loss', logs) - - @combinations.generate(all_strategy_combinations()) - def test_trainer_validate(self, distribution): - with distribution.scope(): - trainer = self.create_test_trainer(self._config) - logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32)) - self.assertEqual(logs['counter'], 5. * distribution.num_replicas_in_sync) - self.assertIn('validation_loss', logs) - - @combinations.generate(all_strategy_combinations()) - def test_trainer_validate_without_loss(self, distribution): - - class MockTaskWithoutValidationLoss(mock_task.MockTask): - - def validation_step(self, inputs, model, metrics=None): - # Disable validation loss. - logs = super().validation_step(inputs, model) - del logs[self.loss] - return logs - - with distribution.scope(): - task = MockTaskWithoutValidationLoss() - trainer = self.create_test_trainer(self._config, task=task) - logs = trainer.evaluate(tf.convert_to_tensor(5, dtype=tf.int32)) - self.assertEqual(logs['counter'], 5. * distribution.num_replicas_in_sync) - self.assertNotIn('validation_loss', logs) - - @combinations.generate( - combinations.combine( - mixed_precision_dtype=['float32', 'bfloat16', 'float16'], - loss_scale=[None, 'dynamic', 128, 256], - ) - ) - def test_configure_optimizer(self, mixed_precision_dtype, loss_scale): - config = cfg.ExperimentConfig( - runtime=cfg.RuntimeConfig(mixed_precision_dtype=mixed_precision_dtype, loss_scale=loss_scale), - trainer=cfg.TrainerConfig( - optimizer_config=cfg - .OptimizationConfig({ - 'optimizer': { - 'type': 'sgd' - }, - 'learning_rate': { - 'type': 'constant' - }, - }) - ) - ) - trainer = self.create_test_trainer(config) - if mixed_precision_dtype == 'float16': - self.assertIsInstance(trainer.optimizer, tf.keras.mixed_precision.LossScaleOptimizer) - if loss_scale in (None, 'dynamic'): - self.assertTrue(trainer.optimizer.dynamic) - else: - self.assertFalse(trainer.optimizer.dynamic) - self.assertEqual(trainer.optimizer.initial_scale, loss_scale) - else: - self.assertIsInstance(trainer.optimizer, (tf.keras.optimizers.SGD, tf.keras.optimizers.legacy.SGD)) - - metrics = trainer.fit(tf.convert_to_tensor(5, dtype=tf.int32)) - self.assertIn('training_loss', metrics) - - def test_export_best_ckpt(self): - config = cfg.ExperimentConfig( - trainer=cfg.TrainerConfig( - best_checkpoint_export_subdir='best_ckpt', - best_checkpoint_eval_metric='acc', - optimizer_config=cfg - .OptimizationConfig({ - 'optimizer': { - 'type': 'sgd' - }, - 'learning_rate': { - 'type': 'constant' - } - }) - ) - ) - model_dir = self.get_temp_dir() - trainer = self.create_test_trainer(config, model_dir=model_dir) - trainer.fit(tf.convert_to_tensor(1, dtype=tf.int32)) - trainer.evaluate(tf.convert_to_tensor(1, dtype=tf.int32)) - self.assertTrue(tf.io.gfile.exists(os.path.join(model_dir, 'best_ckpt', 'info.json'))) - - def test_model_with_compiled_loss(self): - task = mock_task.MockTask() - model = task.build_model() - model.compile(loss=tf.keras.losses.CategoricalCrossentropy()) - trainer = trainer_lib.Trainer( - self._config, task, model=model, optimizer=task.create_optimizer(self._config.trainer.optimizer_config) - ) - logs = trainer.fit(tf.convert_to_tensor(5, dtype=tf.int32)) - self.assertIn('training_loss', logs) - - -if __name__ == '__main__': - tf.test.main() diff --git a/deepray/core/common/distribution_utils.py b/deepray/core/common/distribution_utils.py index 
73887a35..879834a6 100644 --- a/deepray/core/common/distribution_utils.py +++ b/deepray/core/common/distribution_utils.py @@ -24,8 +24,6 @@ from deepray.utils.horovod_utils import is_main_process -FLAGS = flags.FLAGS - def _collective_communication(all_reduce_alg): """Return a CollectiveCommunication based on all_reduce_alg. @@ -97,7 +95,7 @@ def tpu_initialize(tpu_address): return cluster_resolver -def get_distribution_strategy(distribution_strategy=None, all_reduce_alg=None, num_packs=1, **kwargs): +def get_distribution_strategy(distribution_strategy="off", all_reduce_alg=None, num_packs=1, **kwargs): """Return a Strategy for running the model. Args: distribution_strategy: a string specifying which distribution strategy to @@ -126,15 +124,15 @@ def get_distribution_strategy(distribution_strategy=None, all_reduce_alg=None, n `distribution_strategy` is `tpu` but `tpu_address` is not specified. """ del kwargs - if FLAGS.num_gpus < 0: + if flags.FLAGS.num_gpus < 0: raise ValueError("`num_gpus` can not be negative.") - if FLAGS.use_horovod: + if flags.FLAGS.use_horovod: distribution_strategy = "off" if is_main_process(): - logging.info("Run horovod and turn off distribution strategy.") + logging.info("Run horovod and turn off TF distribution strategy.") else: - distribution_strategy = FLAGS.distribution_strategy + distribution_strategy = flags.FLAGS.distribution_strategy if not isinstance(distribution_strategy, str): msg = ("distribution_strategy must be a string but got: %s." % (distribution_strategy,)) @@ -152,7 +150,7 @@ def get_distribution_strategy(distribution_strategy=None, all_reduce_alg=None, n if distribution_strategy == "tpu": # When tpu_address is an empty string, we communicate with local TPUs. - cluster_resolver = tpu_initialize(FLAGS.tpu) + cluster_resolver = tpu_initialize(flags.FLAGS.tpu_address) return tf.distribute.TPUStrategy(cluster_resolver) if distribution_strategy == "multi_worker_mirrored": @@ -161,25 +159,25 @@ def get_distribution_strategy(distribution_strategy=None, all_reduce_alg=None, n ) if distribution_strategy == "one_device": - if FLAGS.num_gpus == 0: + if flags.FLAGS.num_gpus == 0: return tf.distribute.OneDeviceStrategy("device:CPU:0") - if FLAGS.num_gpus > 1: + if flags.FLAGS.num_gpus > 1: raise ValueError("`OneDeviceStrategy` can not be used for more than " "one device.") return tf.distribute.OneDeviceStrategy("device:GPU:0") if distribution_strategy == "mirrored": - if FLAGS.num_gpus == 0: + if flags.FLAGS.num_gpus == 0: devices = ["device:CPU:0"] else: - devices = ["device:GPU:%d" % i for i in range(FLAGS.num_gpus)] + devices = ["device:GPU:%d" % i for i in range(flags.FLAGS.num_gpus)] return tf.distribute.MirroredStrategy( devices=devices, cross_device_ops=_mirrored_cross_device_ops(all_reduce_alg, num_packs) ) if distribution_strategy == "parameter_server": cluster_resolver = tf.distribute.cluster_resolver.TFConfigClusterResolver() - return tf.distribute.experimental.ParameterServerStrategy(cluster_resolver) + return tf.distribute.ParameterServerStrategy(cluster_resolver) raise ValueError("Unrecognized Distribution Strategy: %r" % distribution_strategy) diff --git a/deepray/core/common/flags.py b/deepray/core/common/flags.py index 13cce788..b592f755 100644 --- a/deepray/core/common/flags.py +++ b/deepray/core/common/flags.py @@ -38,20 +38,6 @@ def define_flags(): flags.DEFINE_string( 'experiment', default=None, help='The experiment type registered, specifying an ExperimentConfig.' 
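As a side note on the version-gated imports added to `deepray/core/compile_utils.py` above, the same guard can be exercised in isolation. The module paths below simply mirror the ones in the hunk; the string rewrite of a possible "-tf" suffix into a "+tf" local-version tag is what lets `packaging.version.parse` accept the version string. This is a sketch, not the library's own dispatch code.

```python
import tensorflow as tf
from packaging.version import parse

tf_version = parse(tf.__version__.replace("-tf", "+tf"))

if tf_version < parse("2.11"):
  module_path = "keras.engine.compile_utils"          # legacy Keras layout
elif tf_version > parse("2.16.0"):
  module_path = "tf_keras.src.engine.compile_utils"   # standalone tf-keras package
else:
  module_path = "keras.src.engine.compile_utils"      # Keras 2.11 - 2.16
print("MetricsContainer would come from", module_path)
```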
) - - flags.DEFINE_enum( - 'mode', - default=None, - enum_values=[ - 'train', 'eval', 'train_and_eval', 'continuous_eval', 'continuous_train_and_eval', 'train_and_validate', - 'train_and_post_eval' - ], - help='Mode to run: `train`, `eval`, `train_and_eval`, ' - '`continuous_eval`, `continuous_train_and_eval` and ' - '`train_and_validate` (which is not implemented in ' - 'the open source version).' - ) - flags.DEFINE_string( 'model_dir', default=None, help='The directory where the model and training/evaluation summaries' 'are stored.' diff --git a/deepray/core/compile_utils.py b/deepray/core/compile_utils.py index bc0e8150..5ee1479a 100644 --- a/deepray/core/compile_utils.py +++ b/deepray/core/compile_utils.py @@ -1,10 +1,16 @@ import horovod.tensorflow as hvd import tensorflow as tf from absl import flags -from keras.engine.compile_utils import MetricsContainer, match_dtype_and_rank, get_mask, apply_mask -# Keras = 2.9.0 +from packaging.version import parse -FLAGS = flags.FLAGS +if parse(tf.__version__.replace("-tf", "+tf")) < parse("2.11"): + from keras.engine.compile_utils import MetricsContainer, match_dtype_and_rank, get_mask, apply_mask +elif parse(tf.__version__) > parse("2.16.0"): + from tf_keras.src.engine.compile_utils import MetricsContainer, match_dtype_and_rank + from tf_keras.src.utils.losses_utils import get_mask, apply_mask +else: + from keras.src.engine.compile_utils import MetricsContainer, match_dtype_and_rank + from keras.src.utils.losses_utils import get_mask, apply_mask class HvdMetricsContainer(MetricsContainer): @@ -37,12 +43,12 @@ def update_state(self, y_true, y_pred, sample_weight=None): mask = get_mask(y_p) sw = apply_mask(y_p, sw, mask) - if FLAGS.use_horovod: + if flags.FLAGS.use_horovod: y_t = hvd.allgather(y_t) y_p = hvd.allgather(y_p) - if mask: + if mask is not None: mask = hvd.allgather(mask) - if sw: + if sw is not None: sw = hvd.allgather(sw) for metric_obj in metric_objs: diff --git a/deepray/core/dllogger_class.py b/deepray/core/dllogger_class.py deleted file mode 100644 index 2c851120..00000000 --- a/deepray/core/dllogger_class.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -from dllogger import Logger, StdOutBackend, JSONStreamBackend, Verbosity - - -class dllogger_class(): - - def format_step(self, step): - if isinstance(step, str): - return step - elif isinstance(step, int): - return "Iteration: {} ".format(step) - elif len(step) > 0: - return "Iteration: {} ".format(step[0]) - else: - return "" - - def __init__(self, log_path="bert_dllog.json"): - self.logger = Logger( - [ - StdOutBackend(Verbosity.DEFAULT, step_format=self.format_step), - JSONStreamBackend(Verbosity.VERBOSE, log_path), - ] - ) - self.logger.metadata("mlm_loss", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}) - self.logger.metadata("nsp_loss", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}) - self.logger.metadata("avg_loss_step", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}) - self.logger.metadata("total_loss", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}) - self.logger.metadata("loss", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}) - self.logger.metadata("f1", {"unit": None, "format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}) - self.logger.metadata("precision", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}) - self.logger.metadata("recall", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}) - self.logger.metadata("mcc", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}) - self.logger.metadata("exact_match", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}) - self.logger.metadata( - "throughput_train", - { - "unit": "sequences/s", - "format": ":.3f", - "GOAL": "MAXIMIZE", - "STAGE": "TRAIN" - }, - ) - self.logger.metadata( - "throughput_inf", - { - "unit": "sequences/s", - "format": ":.3f", - "GOAL": "MAXIMIZE", - "STAGE": "VAL" - }, - ) - self.logger.metadata( - "throughput_val", - { - "unit": "sequences/s", - "format": ":.3f", - "GOAL": "MAXIMIZE", - "STAGE": "VAL" - }, - ) diff --git a/deepray/core/module.py b/deepray/core/module.py deleted file mode 100644 index b450e1d9..00000000 --- a/deepray/core/module.py +++ /dev/null @@ -1,627 +0,0 @@ -import time - -import numpy as np -import tensorflow as tf -from absl import logging, flags -from keras import callbacks as callbacks_module -from keras.engine import base_layer -from keras.engine import data_adapter -from keras.engine.data_adapter import _ClusterCoordinatorDataHandler, DataHandler -from keras.utils import tf_utils -from keras.utils import version_utils -from packaging import version -from tensorflow.python.eager import context - -from deepray.core.common import distribution_utils -from deepray.utils import export -from deepray.utils.horovod_utils import is_main_process - -FLAGS = flags.FLAGS - - -def _minimum_control_deps(outputs): - """Returns the minimum control dependencies to ensure step succeeded.""" - if tf.executing_eagerly(): - return [] # Control dependencies not needed. - outputs = tf.nest.flatten(outputs, expand_composites=True) - for out in outputs: - # Variables can't be control dependencies. - if not isinstance(out, tf.Variable): - return [out] # Return first Tensor or Op from outputs. - return [] # No viable Tensor or Op to use for control deps. 
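The `_minimum_control_deps` helper in the removed module.py exists so that a step counter is only incremented after the step's outputs have actually been produced. A small self-contained illustration of that control-dependency pattern follows; the `counter` and the toy `test_step_with_counter` are hypothetical stand-ins, not the module's own code.

```python
import tensorflow as tf

counter = tf.Variable(0, dtype=tf.int64)

@tf.function
def test_step_with_counter(x):
  outputs = tf.reduce_sum(x)  # stand-in for a real test_step's outputs
  # While tracing a tf.function, executing_eagerly() is False, so the
  # counter update is anchored on `outputs` having been computed first.
  deps = [] if tf.executing_eagerly() else [outputs]
  with tf.control_dependencies(deps):
    counter.assign_add(1)
  return outputs

test_step_with_counter(tf.ones((2, 2)))
print(int(counter))  # -> 1
```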
- - -def flatten_metrics_in_order(logs, metrics_names): - """Turns the `logs` dict into a list as per key order of `metrics_names`.""" - results = [] - for name in metrics_names: - if name in logs: - results.append(logs[name]) - for key in sorted(logs.keys()): - if key not in metrics_names: - results.append(logs[key]) - if len(results) == 1: - return results[0] - return results - - -class DataHandlerMOD(DataHandler): - - def _validate_data_handler(self): - pass - - -def get_data_handler(*args, **kwargs): - if getattr(kwargs["model"], "_cluster_coordinator", None): - return _ClusterCoordinatorDataHandler(*args, **kwargs) - return DataHandlerMOD(*args, **kwargs) - - -class Module(): - - def __init__(self, **kwargs): - super().__init__(**kwargs) - self._distribution_strategy = distribution_utils.get_distribution_strategy() - self._init_batch_counters() - self.eval_steps = None - self._cluster_coordinator = None - - self.test_function = None - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def _init_batch_counters(self): - # Untracked Variables, used to keep track of mini-batches seen in `fit`, - # `evaluate`, and `predict`. - agg = tf.VariableAggregation.ONLY_FIRST_REPLICA - self._train_counter = tf.Variable(0, dtype='int64', aggregation=agg) - self._test_counter = tf.Variable(0, dtype='int64', aggregation=agg) - self._predict_counter = tf.Variable(0, dtype='int64', aggregation=agg) - - @tf.__internal__.tracking.no_automatic_dependency_tracking - def _configure_steps_per_execution(self, steps_per_execution): - self._steps_per_execution = tf.Variable( - steps_per_execution, dtype='int64', aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA - ) - - @property - def distribute_strategy(self): - """The `tf.distribute.Strategy` this model was created under.""" - return self._distribution_strategy or tf.distribute.get_strategy() - - def steps_to_run(self, current_step, steps_per_epoch, steps_per_loop): - """Calculates steps to run on device.""" - if steps_per_loop <= 0: - raise ValueError('steps_per_loop should be positive integer.') - if steps_per_loop == 1: - return steps_per_loop, FLAGS.num_accumulation_steps - - # Note: broadcast should be called after the first gradient step to ensure optimizer - # initialization. - # if self.use_horovod and self.current_step == self._first_steps: - # return 1, 1 - - remainder_in_epoch = current_step % steps_per_epoch - if remainder_in_epoch != 0: - return min(steps_per_epoch - remainder_in_epoch, steps_per_loop), FLAGS.num_accumulation_steps - else: - return steps_per_loop, FLAGS.num_accumulation_steps - - def _float_metric_value(self, metric): - """Gets the value of a float-value keras metric.""" - return metric.result().numpy().astype(float) - - def on_epoch_begin(self, epoch): - self._step_epoch = 0 - """Calls the `on_epoch_begin` methods of its callbacks. - """ - self.callbacks.on_epoch_begin(epoch) - - # Training loss/metric are taking average over steps inside micro - # training loop. We reset their values before each round. - self.loss_container.reset_state() - if self.metric_container: - self.metric_container.reset_state() - - def on_batch_end(self, logs, steps, t0): - """Runs custom callbacks at the end of every N(steps) step.""" - self._step_epoch += steps - self.current_step += steps - - self.callbacks.on_train_batch_end(self.current_step, logs) - - elapse_time = time.time() - t0 - # Updates training logging. 
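The `steps_to_run` helper in the removed module.py clips the number of device-side steps so a host loop never runs past the current epoch boundary. A free-standing equivalent (minus the `FLAGS.num_accumulation_steps` bookkeeping) behaves like this:

```python
def steps_to_run(current_step, steps_per_epoch, steps_per_loop):
  # Free-standing equivalent of the method above, without FLAGS handling.
  if steps_per_loop <= 0:
    raise ValueError("steps_per_loop should be a positive integer.")
  if steps_per_loop == 1:
    return 1
  remainder = current_step % steps_per_epoch
  if remainder != 0:
    # Never run past the current epoch boundary.
    return min(steps_per_epoch - remainder, steps_per_loop)
  return steps_per_loop

assert steps_to_run(95, steps_per_epoch=100, steps_per_loop=10) == 5
assert steps_to_run(100, steps_per_epoch=100, steps_per_loop=10) == 10
```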
- if self.steps_per_epoch > 0: - training_status = 'Train Step: %d/%d / time=%.3f sec' % ( - self.current_step, self.steps_per_epoch * self.epochs + self._first_steps, elapse_time - ) - else: - training_status = 'Train Step: %d / time=%.3f sec' % (self.current_step, elapse_time) - - self._steps_from_save += steps - - if self._steps_from_save >= FLAGS.save_checkpoint_steps: - export.export_to_checkpoint(self.manager, self.current_step) - self._steps_from_save = 0 - - if self.train_summary_writer: - with self.train_summary_writer.as_default(): - for metric in self.metrics: - metric_value = self._float_metric_value(metric) - training_status += ' %s=%f' % (metric.name, metric_value) - tf.summary.scalar(metric.name, metric_value, step=self.current_step) - self.train_summary_writer.flush() - - # The number of samples trained per second - step_throughput = self._performance_calculator(steps, self.global_batch_size) - if is_main_process(): - if self.use_float16: - if version.parse(tf.keras.__version__.replace("-tf", "+tf")) < version.parse("2.11"): - logging.info( - 'Step: %d Lr %g Loss scale %g' % - (self.current_step, self.optimizer._optimizer._decayed_lr('float32'), self.optimizer.loss_scale) - ) - else: - logging.info( - 'Step: %d Lr %g Loss scale %g' % (self.current_step, self.optimizer.lr, self.optimizer.loss_scale) - ) - - logging.info(training_status) - logging.info('Perf %.2f samples/s' % step_throughput) - - if self.current_step > self._first_steps + steps * 2: - self._perf_wo += step_throughput - self._perf_wo_n += 1 - - def on_epoch_end(self, epoch, current_step, eval_input, epoch_logs=None): - # Saves model checkpoints and run validation steps at every epoch end. - # To avoid repeated model saving, we do not save after the last step of training. - if epoch < self.epochs - 1: - export.export_to_checkpoint(self.manager, current_step) - if eval_input: # and is_main_process(): - if is_main_process(): - logging.info('Running evaluation after step: %s.', current_step) - - val_logs = self.evaluate(eval_input, self.eval_steps) - val_logs = {'val_' + name: val for name, val in val_logs.items()} - epoch_logs.update(val_logs) - - if is_main_process(): - with self.eval_summary_writer.as_default(): - for name, value in val_logs.items(): - logging.info('Step: [%d] Validation %s = %f', current_step, name, value) - tf.summary.scalar(name, value, step=current_step) - self.eval_summary_writer.flush() - """Calls the `on_epoch_end` methods of its callbacks. - """ - self.callbacks.on_epoch_end(epoch, epoch_logs) - - def evaluate(self, eval_input: tf.data.Dataset, eval_steps: int = None, callbacks=None, return_dict=True, **kwargs): - """Returns the loss value & metrics values for the model in test mode. - - Computation is done in batches (see the `batch_size` arg.) - - Args: - eval_input: Target data. Like the input data `x`, it could be either Numpy - array(s) or TensorFlow tensor(s). It should be consistent with `x` - (you cannot have Numpy inputs and tensor targets, or inversely). - If `x` is a dataset, generator or `keras.utils.Sequence` instance, - `y` should not be specified (since targets will be obtained from - the iterator/dataset). - eval_steps: Integer or `None`. Total number of steps (batches of samples) - before declaring the evaluation round finished. Ignored with the - default value of `None`. If x is a `tf.data` dataset and `steps` - is None, 'evaluate' will run until the dataset is exhausted. This - argument is not supported with array inputs. 
- - - See the discussion of `Unpacking behavior for iterator-like inputs` for - `Model.fit`. - - Returns: - Scalar test loss (if the model has a single output and no metrics) - or list of scalars (if the model has multiple outputs - and/or metrics). The attribute `model.metrics_names` will give you - the display labels for the scalar outputs. - - Raises: - RuntimeError: If `trainer.evaluate` is wrapped in a `tf.function`. - """ - if eval_steps is None: - if self.eval_steps is not None: - eval_steps = self.eval_steps - else: - if self.eval_steps is None: - self.eval_steps = eval_steps - """Runs validation steps and aggregate metrics.""" - if self.eval_steps is None: - self.eval_steps = eval_steps - - base_layer.keras_api_gauge.get_cell('evaluate').set(True) - version_utils.disallow_legacy_graph('Model', 'evaluate') - - use_cached_eval_dataset = kwargs.pop('_use_cached_eval_dataset', False) - if kwargs: - raise TypeError(f'Invalid keyword arguments: {list(kwargs.keys())}') - - # TODO(@fuhailin): custom ProgbarLogger fix bug when verbose = 2 - verbose = 1 - with distribution_utils.get_strategy_scope(self._distribution_strategy): - # Use cached evaluation data only when it's called in `Model.fit` - if use_cached_eval_dataset and getattr(self, '_eval_data_handler', None) is not None: - data_handler = self._eval_data_handler - else: - # Creates a `tf.data.Dataset` and handles batch and epoch iteration. - data_handler = get_data_handler( - x=eval_input, - y=None, - sample_weight=None, - batch_size=FLAGS.batch_size, - steps_per_epoch=self.eval_steps, - initial_epoch=0, - epochs=1, - max_queue_size=10, - workers=1, - use_multiprocessing=False, - model=self.main_model, - steps_per_execution=self._steps_per_execution - ) - - # Container that configures and calls `tf.keras.Callback`s. - if not isinstance(callbacks, callbacks_module.CallbackList): - callbacks = callbacks_module.CallbackList( - callbacks, - add_history=True, - add_progbar=verbose != 0, - model=self.main_model, - verbose=verbose, - epochs=1, - steps=data_handler.inferred_steps - ) - logs = {} - self.test_function = self.make_test_function() - self._test_counter.assign(0) - callbacks.on_test_begin() - for _, iterator in data_handler.enumerate_epochs(): # Single epoch. - # Re-initialize evaluation metric. - self.reset_metrics() - while eval_steps is None or self._test_counter.numpy() < eval_steps: - try: - steps, _ = self.steps_to_run( - self._test_counter.numpy(), - steps_per_epoch=eval_steps if eval_steps else -1, - steps_per_loop=FLAGS.steps_per_summary - ) - with tf.profiler.experimental.Trace('test', step_num=self._test_counter.numpy(), _r=1): - callbacks.on_test_batch_begin(self._test_counter.numpy()) - tmp_logs = self.test_function(iterator, tf.convert_to_tensor(steps, dtype=tf.int32)) - if data_handler.should_sync: - context.async_wait() - logs = tmp_logs # No error, now safe to assign to logs. - callbacks.on_test_batch_end(self._test_counter.numpy(), logs) - except (tf.errors.OutOfRangeError, StopIteration): - callbacks.on_test_batch_end(self._test_counter.numpy(), logs) - self.eval_steps = self._test_counter.numpy() - if is_main_process(): - logging.info('Data exhausted after %d eval_steps', self._test_counter.numpy()) - break - - logs = tf_utils.sync_to_numpy_or_python_type(logs) - callbacks.on_test_end(logs=logs) - - if return_dict: - return logs - else: - return flatten_metrics_in_order(logs, self.metrics_names) - - def make_test_function(self, force=False): - """Creates a function that executes one step of evaluation. 
- - This method can be overridden to support custom evaluation logic. - This method is called by `Model.evaluate` and `Model.test_on_batch`. - - Typically, this method directly controls `tf.function` and - `tf.distribute.Strategy` settings, and delegates the actual evaluation - logic to `Model.test_step`. - - This function is cached the first time `Model.evaluate` or - `Model.test_on_batch` is called. The cache is cleared whenever - `Model.compile` is called. You can skip the cache and generate again the - function with `force=True`. - - Args: - force: Whether to regenerate the test function and skip the cached - function if available. - - Returns: - Function. The function created by this method should accept a - `tf.data.Iterator`, and return a `dict` containing values that will - be passed to `tf.keras.Callbacks.on_test_batch_end`. - """ - if self.test_function is not None and not force: - return self.test_function - - def step_function(trainer, iterator): - """Runs a single evaluation step.""" - - def run_step(data): - outputs = self.test_step(data) - # Ensure counter is updated only if `test_step` succeeds. - with tf.control_dependencies(_minimum_control_deps(outputs)): - trainer._test_counter.assign_add(1) # pylint: disable=protected-access - return outputs - - if self._jit_compile: - run_step = tf.function(run_step, jit_compile=True, reduce_retracing=True) - - data = next(iterator) - outputs = run_step(data) - return outputs - - # Special case if steps_per_execution is one. - if self._steps_per_execution is None or self._steps_per_execution.numpy().item() == 1: - - def test_function(iterator): - """Runs a test execution with a single step.""" - return step_function(self, iterator) - - if not self.run_eagerly: - test_function = tf.function(test_function, reduce_retracing=True) - - if self._cluster_coordinator: - self.test_function = lambda it: self._cluster_coordinator.schedule( # pylint: disable=g-long-lambda - test_function, args=(it,)) - else: - self.test_function = test_function - - # If we're using a coordinator, use the value of self._steps_per_execution - # at the time the function is called/scheduled, and not when it is actually - # executed. - elif self._cluster_coordinator: - - def test_function(iterator, steps_per_execution): - """Runs a test execution with multiple steps.""" - for _ in tf.range(steps_per_execution): - outputs = step_function(self, iterator) - return outputs - - if not self.run_eagerly: - test_function = tf.function(test_function, reduce_retracing=True) - - self.test_function = lambda it: self._cluster_coordinator.schedule( # pylint: disable=g-long-lambda - test_function, - args=(it, self._steps_per_execution.value())) - else: - - def test_function(iterator, steps): - """Runs a test execution with multiple steps.""" - for _ in tf.range(steps): - outputs = step_function(self, iterator) - return outputs - - if not self.run_eagerly: - test_function = tf.function(test_function, reduce_retracing=True) - self.test_function = test_function - - return self.test_function - - @property - def metrics(self): - metrics = [] - if self.loss_container is not None: - metrics += self.loss_container.metrics - if self.metric_container is not None: - metrics += self.metric_container.metrics - return metrics - - def get_metrics_result(self): - """Returns the model's metrics values as a dict. - - If any of the metric result is a dict (containing multiple metrics), - each of them gets added to the top level returned dict of this method. 
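The multi-step branch of `make_test_function` above amounts to running `step_function` inside a `tf.range` loop so that several batches execute per host call. A minimal standalone version of that loop is sketched below; the toy dataset and metric are placeholders, and the `reduce_retracing` argument assumes a reasonably recent TF release, matching what the removed code itself uses.

```python
import tensorflow as tf

mean_loss = tf.keras.metrics.Mean(name="loss")
dataset = tf.data.Dataset.from_tensor_slices(tf.range(10, dtype=tf.float32)).batch(2)

def step_function(batch):
  mean_loss.update_state(tf.reduce_mean(batch))
  return {"loss": mean_loss.result()}

@tf.function(reduce_retracing=True)
def test_function(iterator, steps):
  # `steps` batches are consumed per call, so the host loop wakes less often.
  outputs = {"loss": tf.constant(0.0)}
  for _ in tf.range(steps):
    outputs = step_function(next(iterator))
  return outputs

it = iter(dataset)
print(test_function(it, tf.constant(2)))  # metric state after two batches
```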
- - Returns: - A `dict` containing values of the metrics listed in `self.metrics`. - Example: - `{'loss': 0.2, 'accuracy': 0.7}`. - """ - # Collect metrics to return - return_metrics = {} - for metric in self.metrics: - result = metric.result() - if isinstance(result, dict): - return_metrics.update(result) - else: - return_metrics[metric.name] = result - return return_metrics - - def test_step(self, data): - """The logic for one evaluation step. - - This method can be overridden to support custom evaluation logic. - This method is called by `Model.make_test_function`. - - This function should contain the mathematical logic for one step of - evaluation. - This typically includes the forward pass, loss calculation, and metrics - updates. - - Configuration details for *how* this logic is run (e.g. `tf.function` and - `tf.distribute.Strategy` settings), should be left to - `Model.make_test_function`, which can also be overridden. - - Args: - data: A nested structure of `Tensor`s. - - Returns: - A `dict` containing values that will be passed to - `tf.keras.callbacks.CallbackList.on_train_batch_end`. Typically, the - values of the `Model`'s metrics are returned. - """ - x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data) - - y_pred = self.main_model(x, training=False) - # Updates stateful loss metrics. - self.compute_loss(x, y, y_pred, sample_weight) - return self.compute_metrics(x, y, y_pred, sample_weight) - - def compute_loss(self, x=None, y=None, y_pred=None, sample_weight=None): - """Compute the total loss, validate it, and return it. - - Subclasses can optionally override this method to provide custom loss - computation logic. - - Example: - ```python - class MyModel(tf.keras.Model): - - def __init__(self, *args, **kwargs): - super(MyModel, self).__init__(*args, **kwargs) - self.loss_tracker = tf.keras.metrics.Mean(name='loss') - - def compute_loss(self, x, y, y_pred, sample_weight): - loss = tf.reduce_mean(tf.math.squared_difference(y_pred, y)) - loss += tf.add_n(self.losses) - self.loss_tracker.update_state(loss) - return loss - - def reset_metrics(self): - self.loss_tracker.reset_states() - - @property - def metrics(self): - return [self.loss_tracker] - - tensors = tf.random.uniform((10, 10)), tf.random.uniform((10,)) - dataset = tf.data.Dataset.from_tensor_slices(tensors).repeat().batch(1) - - inputs = tf.keras.layers.Input(shape=(10,), name='my_input') - outputs = tf.keras.layers.Dense(10)(inputs) - model = MyModel(inputs, outputs) - model.add_loss(tf.reduce_sum(outputs)) - - optimizer = tf.keras.optimizers.SGD() - model.compile(optimizer, loss='mse', steps_per_execution=10) - model.fit(dataset, epochs=2, steps_per_epoch=10) - print('My custom loss: ', model.loss_tracker.result().numpy()) - ``` - - Args: - x: Input data. - y: Target data. - y_pred: Predictions returned by the model (output of `model(x)`) - sample_weight: Sample weights for weighting the loss function. - - Returns: - The total loss as a `tf.Tensor`, or `None` if no loss results (which is - the case when called by `Model.test_step`). - """ - del x # The default implementation does not use `x`. - return self.loss_container( - y, - y_pred, - sample_weight, - # regularization_losses=self.losses - ) - - def compute_metrics(self, x, y, y_pred, sample_weight): - """Update metric states and collect all metrics to be returned. - - Subclasses can optionally override this method to provide custom metric - updating and collection logic. 
- - Example: - ```python - class MyModel(tf.keras.Sequential): - - def compute_metrics(self, x, y, y_pred, sample_weight): - - # This super call updates `self.compiled_metrics` and returns results - # for all metrics listed in `self.metrics`. - metric_results = super(MyModel, self).compute_metrics( - x, y, y_pred, sample_weight) - - # Note that `self.custom_metric` is not listed in `self.metrics`. - self.custom_metric.update_state(x, y, y_pred, sample_weight) - metric_results['custom_metric_name'] = self.custom_metric.result() - return metric_results - ``` - - Args: - x: Input data. - y: Target data. - y_pred: Predictions returned by the model (output of `model.call(x)`) - sample_weight: Sample weights for weighting the loss function. - - Returns: - A `dict` containing values that will be passed to - `tf.keras.callbacks.CallbackList.on_train_batch_end()`. Typically, the - values of the metrics listed in `self.metrics` are returned. Example: - `{'loss': 0.2, 'accuracy': 0.7}`. - """ - del x # The default implementation does not use `x`. - self.metric_container.update_state(y, y_pred, sample_weight) - # Collect metrics to return - return self.get_metrics_result() - - def reset_metrics(self): - """Resets the state of all the metrics in the model. - - Examples: - - >>> inputs = tf.keras.layers.Input(shape=(3,)) - >>> outputs = tf.keras.layers.Dense(2)(inputs) - >>> model = tf.keras.models.Model(inputs=inputs, outputs=outputs) - >>> model.compile(optimizer="Adam", loss="mse", metrics=["mae"]) - - >>> x = np.random.random((2, 3)) - >>> y = np.random.randint(0, 2, (2, 2)) - >>> _ = model.fit(x, y, verbose=0) - >>> assert all(float(m.result()) for m in model.metrics) - - >>> model.reset_metrics() - >>> assert all(float(m.result()) == 0 for m in model.metrics) - - """ - for m in self.metrics: - m.reset_state() - - @property - def metrics_names(self): - """Returns the model's display labels for all outputs. - - Note: `metrics_names` are available only after a `keras.Model` has been - trained/evaluated on actual data. - - Examples: - - >>> inputs = tf.keras.layers.Input(shape=(3,)) - >>> outputs = tf.keras.layers.Dense(2)(inputs) - >>> model = tf.keras.models.Model(inputs=inputs, outputs=outputs) - >>> model.compile(optimizer="Adam", loss="mse", metrics=["mae"]) - >>> model.metrics_names - [] - - >>> x = np.random.random((2, 3)) - >>> y = np.random.randint(0, 2, (2, 2)) - >>> model.fit(x, y) - >>> model.metrics_names - ['loss', 'mae'] - - >>> inputs = tf.keras.layers.Input(shape=(3,)) - >>> d = tf.keras.layers.Dense(2, name='out') - >>> output_1 = d(inputs) - >>> output_2 = d(inputs) - >>> model = tf.keras.models.Model( - ... inputs=inputs, outputs=[output_1, output_2]) - >>> model.compile(optimizer="Adam", loss="mse", metrics=["mae", "acc"]) - >>> model.fit(x, (y, y)) - >>> model.metrics_names - ['loss', 'out_loss', 'out_1_loss', 'out_mae', 'out_acc', 'out_1_mae', - 'out_1_acc'] - - """ - - # This property includes all output names including `loss` and per-output - # losses for backward compatibility. 
- return [m.name for m in self.metrics] diff --git a/deepray/core/platform/build_config.default.bzl b/deepray/core/platform/build_config.default.bzl index 2590d8a1..dc54c11f 100644 --- a/deepray/core/platform/build_config.default.bzl +++ b/deepray/core/platform/build_config.default.bzl @@ -1,12 +1,12 @@ """OSS versions of Bazel macros that can't be migrated to TSL.""" +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") +load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm") load( "//deepray/tsl:tsl.bzl", "clean_dep", "if_libtpu", ) -load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") -load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm") load( "//third_party/mkl:build_defs.bzl", "if_mkl_ml", diff --git a/deepray/core/platform/build_config_root.bzl b/deepray/core/platform/build_config_root.bzl index 4fc39673..30a4fee6 100644 --- a/deepray/core/platform/build_config_root.bzl +++ b/deepray/core/platform/build_config_root.bzl @@ -1,5 +1,10 @@ """Provides a redirection point for platform specific implementations of starlark utilities.""" +load( + "//deepray/core/platform:build_config_root.default.bzl", + _if_dynamic_kernels = "if_dynamic_kernels", + _tf_additional_plugin_deps = "tf_additional_plugin_deps", +) load( "//deepray/tsl/platform/default:build_config_root.bzl", _if_static = "if_static", @@ -12,11 +17,6 @@ load( _tf_exec_properties = "tf_exec_properties", _tf_gpu_tests_tags = "tf_gpu_tests_tags", ) -load( - "//deepray/core/platform:build_config_root.default.bzl", - _if_dynamic_kernels = "if_dynamic_kernels", - _tf_additional_plugin_deps = "tf_additional_plugin_deps", -) if_dynamic_kernels = _if_dynamic_kernels if_static = _if_static diff --git a/deepray/core/trainer.py b/deepray/core/trainer.py new file mode 100644 index 00000000..8e112e0b --- /dev/null +++ b/deepray/core/trainer.py @@ -0,0 +1,3073 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Training-related part of the TF-Keras engine.""" + +import copy +import os +import random +import sys +import warnings +import weakref +from typing import Union, List, Dict, Text + +import horovod.tensorflow as hvd +import numpy as np +import tensorflow as tf +import tf_keras as keras +from absl import flags +from tensorflow.python.distribute import distribute_utils +from tensorflow.python.distribute import input_ops +from tensorflow.python.eager import context +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.util.tf_export import keras_export +from tensorflow.tools.docs import doc_controls +from tf_keras import callbacks as callbacks_module +from tf_keras import optimizers +from tf_keras.src.dtensor import dtensor_api +from tf_keras.src.dtensor import layout_map as layout_map_lib +from tf_keras.src.engine import compile_utils +from tf_keras.src.engine import data_adapter +from tf_keras.src.engine import training as training_module +from tf_keras.src.engine import training_utils +from tf_keras.src.metrics import base_metric +from tf_keras.src.mixed_precision import loss_scale_optimizer as lso +from tf_keras.src.optimizers import optimizer +from tf_keras.src.optimizers import optimizer_v1 +from tf_keras.src.saving import serialization_lib +from tf_keras.src.utils import generic_utils +from tf_keras.src.utils import steps_per_execution_tuning +from tf_keras.src.utils import tf_utils +from tf_keras.src.utils import traceback_utils +from tf_keras.src.utils import version_utils +from tf_keras.src.utils.mode_keys import ModeKeys + +from deepray.callbacks import HvdCallbackList +from deepray.callbacks.progbar_logger import ProgbarLogger +from deepray.custom_ops.embedding_variable import kv_variable_ops +from deepray.utils import logging_util +from deepray.utils.horovod_utils import is_main_process + +logger = logging_util.get_logger() + +try: + from tensorflow_recommenders_addons.dynamic_embedding.python.ops.embedding_weights import TrainableWrapper + from tensorflow_recommenders_addons.dynamic_embedding.python.ops.shadow_embedding_ops import DEResourceVariable +except ImportError: + TrainableWrapper, DEResourceVariable = None, None + + +def set_random_seed(random_seed): + random.seed(random_seed) # set random seed for python + np.random.seed(random_seed) # set random seed for numpy + tf.random.set_seed(random_seed) # set random seed for tensorflow-cpu + os.environ['TF_DETERMINISTIC_OPS'] = '1' # set random seed for tensorflow-gpu + + +@keras_export("keras.Model", "keras.models.Model") +class Trainer(): + """A model grouping layers into an object with training/inference features. + + Args: + inputs: The input(s) of the model: a `keras.Input` object or a + combination of `keras.Input` objects in a dict, list or tuple. + outputs: The output(s) of the model: a tensor that originated from + `keras.Input` objects or a combination of such tensors in a dict, + list or tuple. See Functional API example below. + name: String, the name of the model. 
+ + There are two ways to instantiate a `Model`: + + 1 - With the "Functional API", where you start from `Input`, + you chain layer calls to specify the model's forward pass, + and finally you create your model from inputs and outputs: + + ```python + import tensorflow as tf + + inputs = tf.keras.Input(shape=(3,)) + x = tf.keras.layers.Dense(4, activation=tf.nn.relu)(inputs) + outputs = tf.keras.layers.Dense(5, activation=tf.nn.softmax)(x) + model = tf.keras.Model(inputs=inputs, outputs=outputs) + ``` + + Note: Only dicts, lists, and tuples of input tensors are supported. Nested + inputs are not supported (e.g. lists of list or dicts of dict). + + A new Functional API model can also be created by using the + intermediate tensors. This enables you to quickly extract sub-components + of the model. + + Example: + + ```python + inputs = keras.Input(shape=(None, None, 3)) + processed = keras.layers.RandomCrop(width=32, height=32)(inputs) + conv = keras.layers.Conv2D(filters=2, kernel_size=3)(processed) + pooling = keras.layers.GlobalAveragePooling2D()(conv) + feature = keras.layers.Dense(10)(pooling) + + full_model = keras.Model(inputs, feature) + backbone = keras.Model(processed, conv) + activations = keras.Model(conv, feature) + ``` + + Note that the `backbone` and `activations` models are not + created with `keras.Input` objects, but with the tensors that are originated + from `keras.Input` objects. Under the hood, the layers and weights will + be shared across these models, so that user can train the `full_model`, and + use `backbone` or `activations` to do feature extraction. + The inputs and outputs of the model can be nested structures of tensors as + well, and the created models are standard Functional API models that support + all the existing APIs. + + 2 - By subclassing the `Model` class: in that case, you should define your + layers in `__init__()` and you should implement the model's forward pass + in `call()`. + + ```python + import tensorflow as tf + + class MyModel(tf.keras.Model): + + def __init__(self): + super().__init__() + self.dense1 = tf.keras.layers.Dense(4, activation=tf.nn.relu) + self.dense2 = tf.keras.layers.Dense(5, activation=tf.nn.softmax) + + def call(self, inputs): + x = self.dense1(inputs) + return self.dense2(x) + + model = MyModel() + ``` + + If you subclass `Model`, you can optionally have + a `training` argument (boolean) in `call()`, which you can use to specify + a different behavior in training and inference: + + ```python + import tensorflow as tf + + class MyModel(tf.keras.Model): + + def __init__(self): + super().__init__() + self.dense1 = tf.keras.layers.Dense(4, activation=tf.nn.relu) + self.dense2 = tf.keras.layers.Dense(5, activation=tf.nn.softmax) + self.dropout = tf.keras.layers.Dropout(0.5) + + def call(self, inputs, training=False): + x = self.dense1(inputs) + if training: + x = self.dropout(x, training=training) + return self.dense2(x) + + model = MyModel() + ``` + + Once the model is created, you can config the model with losses and metrics + with `model.compile()`, train the model with `model.fit()`, or use the model + to do prediction with `model.predict()`. 
+ """ + + @tf.__internal__.tracking.no_automatic_dependency_tracking + @traceback_utils.filter_traceback + def __init__( + self, + model: Union[keras.Model, List[keras.Model], Dict[Text, keras.Model]], + optimizer="rmsprop", + loss=None, + metrics=None, + loss_weights=None, + weighted_metrics=None, + run_eagerly=None, + steps_per_execution=None, + jit_compile=None, + pss_evaluation_shards=0, + *args, + **kwargs + ): + + self._model = {} + if isinstance(model, list): + if len(model) > 0: + self._model = {"main": model[0]} + if len(model) == 2: + self._model["sub_model"] = model[1] + else: + for i in range(1, len(model)): + self._model[f"sub_model{i}"] = model[i] + else: + raise ValueError("Not a reachable model.") + elif isinstance(model, dict): + main_keys = [k for k in model.keys() if "main" in k] + if len(main_keys) == 1: + if (len(model) == 1): + self._model = {"main": next(iter(model.values()))} + else: + self._model = model + else: + raise ValueError(f"Must set only one model with key contains \"main\", found {main_keys}.") + elif isinstance(model, (keras.Model, tf.keras.Model)): + self._model = {"main": model} + else: + raise ValueError("Not a reachable model.") + + if run_eagerly is None: + run_eagerly = flags.FLAGS.run_eagerly + + if steps_per_execution is None: + steps_per_execution = flags.FLAGS.steps_per_execution + + # Special case for Subclassed Functional Model, which we couldn't detect + # when __new__ is called. We only realize it is a functional model when + # it calls super.__init__ with input and output tensor. + from tf_keras.src.engine import functional + + if training_module.is_functional_model_init_params(args, kwargs) and not isinstance(self, functional.Functional): + # Filter the kwargs for multiple inheritance. + supported_kwargs = [ + "inputs", + "outputs", + "name", + "trainable", + "skip_init", + ] + model_kwargs = {k: kwargs[k] for k in kwargs if k in supported_kwargs} + other_kwargs = {k: kwargs[k] for k in kwargs if k not in supported_kwargs} + training_module.inject_functional_model_class(self.__class__) + functional.Functional.__init__(self, *args, **model_kwargs) + + # In case there is any multiple inheritance here, we need to call + # the __init__ for any class that appears after the Functional + # class. + clz_to_init = [] + found_functional_class = False + for clz in self.__class__.__bases__: + if issubclass(clz, functional.Functional): + found_functional_class = True + continue + if found_functional_class: + clz_to_init.append(clz) + + if clz_to_init: + for clz in clz_to_init: + clz.__init__(self, *args, **other_kwargs) + elif other_kwargs: + # In case there are unused kwargs, we should raise an error to + # user, in case they have a typo in the param name. + raise TypeError( + "The following keyword arguments passed to `Model` aren't " + "supported: {}.".format(other_kwargs) + ) + return + + # The following are implemented as property functions: + # self.trainable_weights + # self.non_trainable_weights + # `inputs` / `outputs` will only appear in kwargs if either are + # misspelled. + generic_utils.validate_kwargs( + kwargs, + { + "trainable", + "dtype", + "dynamic", + "name", + "autocast", + "inputs", + "outputs", + }, + ) + super().__init__(**kwargs) + + # stop_training is used by callback to stop training when error happens + self.stop_training = False + self.history = None + # These objects are used in the default `Model.compile`. 
They are not + # guaranteed to be set after `Model.compile` is called, as users can + # override compile with custom logic. + self.compiled_loss = None + self.compiled_metrics = None + + # Don't reset compilation if already done. This may occur if calling + # `__init__` (or `_init_graph_network`) on an already-compiled model + # such as a Sequential model. Sequential models may need to rebuild + # themselves after compilation. + self._maybe_create_attribute("_is_compiled", False) + self._maybe_create_attribute("optimizer", None) + + # Model must be created under scope of DistStrat it will be trained + # with. + if tf.distribute.has_strategy(): + self._distribution_strategy = tf.distribute.get_strategy() + else: + self._distribution_strategy = None + self._distribute_reduction_method = None + + self._cluster_coordinator = None + + # Defaults to value of `tf.config.experimental_functions_run_eagerly`. + self._run_eagerly = None + # Initialize cache attrs. + self._reset_compile_cache() + + # Fault-tolerance handler. Set in `ModelCheckpoint`. + self._training_state = None + + self._steps_per_execution = None + self._steps_per_execution_tuner = None + self._autotune_steps_per_execution = False + + self._layout_map = layout_map_lib.get_current_layout_map() + + self._init_batch_counters() + self._base_model_initialized = True + + # `jit_compile` starts off with None as default and gets overwritten by + # the value specified in `Model.compile`, and this is effective for + # `fit`, `evaluate`, and `predict`. + self._jit_compile = None + + self.compile( + optimizer=optimizer, + loss=loss, + metrics=metrics, + loss_weights=loss_weights, + weighted_metrics=weighted_metrics, + run_eagerly=run_eagerly, + steps_per_execution=steps_per_execution, + jit_compile=jit_compile, + pss_evaluation_shards=pss_evaluation_shards, + **kwargs, + ) + + if is_main_process(): + logger.info("Initialize training") + logger.info("flags.FLAGS:") + for key, value in sorted(flags.FLAGS.flag_values_dict().items()): + logger.info(f"\t{key:25}= {value}") + if flags.FLAGS.random_seed is not None: + set_random_seed(flags.FLAGS.random_seed) + + def _create_counter_variable(self, init_value): + """Helper function for counter variable creation. + + For the DTensor use case with layout map, since the variable are not + tracked by model, they can't be visited by the layout map, and need to + be properly initialized as DVariable. + """ + # This function should be removed after we move to the strategy based + # implementation for DTensor. + if self._layout_map is None: + agg = tf.VariableAggregation.ONLY_FIRST_REPLICA + return tf.Variable(init_value, dtype="int64", aggregation=agg) + else: + layout = dtensor_api.Layout.replicated(mesh=self._layout_map.get_default_mesh(), rank=0) + return dtensor_api.DVariable(init_value, dtype="int64", layout=layout) + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _init_batch_counters(self): + # Untracked Variables, used to keep track of mini-batches seen in `fit`, + # `evaluate`, and `predict`. + if not tf.inside_function(): + # Creating variables inside tf.function is not allowed, hence + # these would otherwise prevent users from creating TF-Keras layers + # inside tf.function. + # These variables are not connected to outputs so they have no + # effect on graph generation anyway. 
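      # Note: `compile()` below also assigns `_train_counter` to
      # `optimizer.global_step`, so any logic that reads the optimizer's
      # global step advances in lockstep with the counters created here.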
+ + self._train_counter = self._create_counter_variable(0) + self._test_counter = self._create_counter_variable(0) + self._predict_counter = self._create_counter_variable(0) + if flags.FLAGS.use_horovod: + self.first_batch = tf.Variable(True, trainable=False, dtype=tf.bool, name='first_batch') + + @traceback_utils.filter_traceback + def compile( + self, + optimizer="rmsprop", + loss=None, + metrics=None, + loss_weights=None, + weighted_metrics=None, + run_eagerly=None, + steps_per_execution=None, + jit_compile=None, + pss_evaluation_shards=0, + **kwargs, + ): + """Configures the model for training. + + Example: + + ```python + model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), + loss=tf.keras.losses.BinaryCrossentropy(), + metrics=[tf.keras.metrics.BinaryAccuracy(), + tf.keras.metrics.FalseNegatives()]) + ``` + + Args: + optimizer: String (name of optimizer) or optimizer instance. See + `tf.keras.optimizers`. + loss: Loss function. May be a string (name of loss function), or + a `tf.keras.losses.Loss` instance. See `tf.keras.losses`. A loss + function is any callable with the signature `loss = fn(y_true, + y_pred)`, where `y_true` are the ground truth values, and + `y_pred` are the model's predictions. + `y_true` should have shape + `(batch_size, d0, .. dN)` (except in the case of + sparse loss functions such as + sparse categorical crossentropy which expects integer arrays of + shape `(batch_size, d0, .. dN-1)`). + `y_pred` should have shape `(batch_size, d0, .. dN)`. + The loss function should return a float tensor. + If a custom `Loss` instance is + used and reduction is set to `None`, return value has shape + `(batch_size, d0, .. dN-1)` i.e. per-sample or per-timestep loss + values; otherwise, it is a scalar. If the model has multiple + outputs, you can use a different loss on each output by passing a + dictionary or a list of losses. The loss value that will be + minimized by the model will then be the sum of all individual + losses, unless `loss_weights` is specified. + metrics: List of metrics to be evaluated by the model during + training and testing. Each of this can be a string (name of a + built-in function), function or a `tf.keras.metrics.Metric` + instance. See `tf.keras.metrics`. Typically you will use + `metrics=['accuracy']`. + A function is any callable with the signature `result = fn(y_true, + y_pred)`. To specify different metrics for different outputs of a + multi-output model, you could also pass a dictionary, such as + `metrics={'output_a':'accuracy', 'output_b':['accuracy', 'mse']}`. + You can also pass a list to specify a metric or a list of metrics + for each output, such as + `metrics=[['accuracy'], ['accuracy', 'mse']]` + or `metrics=['accuracy', ['accuracy', 'mse']]`. When you pass the + strings 'accuracy' or 'acc', we convert this to one of + `tf.keras.metrics.BinaryAccuracy`, + `tf.keras.metrics.CategoricalAccuracy`, + `tf.keras.metrics.SparseCategoricalAccuracy` based on the shapes + of the targets and of the model output. We do a similar + conversion for the strings 'crossentropy' and 'ce' as well. + The metrics passed here are evaluated without sample weighting; if + you would like sample weighting to apply, you can specify your + metrics via the `weighted_metrics` argument instead. + loss_weights: Optional list or dictionary specifying scalar + coefficients (Python floats) to weight the loss contributions of + different model outputs. 
The loss value that will be minimized by + the model will then be the *weighted sum* of all individual + losses, weighted by the `loss_weights` coefficients. If a list, + it is expected to have a 1:1 mapping to the model's outputs. If a + dict, it is expected to map output names (strings) to scalar + coefficients. + weighted_metrics: List of metrics to be evaluated and weighted by + `sample_weight` or `class_weight` during training and testing. + run_eagerly: Bool. If `True`, this `Model`'s logic will not be + wrapped in a `tf.function`. Recommended to leave this as `None` + unless your `Model` cannot be run inside a `tf.function`. + `run_eagerly=True` is not supported when using + `tf.distribute.experimental.ParameterServerStrategy`. Defaults to + `False`. + steps_per_execution: Int or `'auto'`. The number of batches to + run during each `tf.function` call. If set to "auto", keras will + automatically tune `steps_per_execution` during runtime. Running + multiple batches inside a single `tf.function` call can greatly + improve performance on TPUs, when used with distributed strategies + such as `ParameterServerStrategy`, or with small models with a + large Python overhead. At most, one full epoch will be run each + execution. If a number larger than the size of the epoch is + passed, the execution will be truncated to the size of the epoch. + Note that if `steps_per_execution` is set to `N`, + `Callback.on_batch_begin` and `Callback.on_batch_end` methods will + only be called every `N` batches (i.e. before/after each + `tf.function` execution). Defaults to `1`. + jit_compile: If `True`, compile the model training step with XLA. + [XLA](https://www.tensorflow.org/xla) is an optimizing compiler + for machine learning. + `jit_compile` is not enabled for by default. + Note that `jit_compile=True` + may not necessarily work for all models. + For more information on supported operations please refer to the + [XLA documentation](https://www.tensorflow.org/xla). + Also refer to + [known XLA issues](https://www.tensorflow.org/xla/known_issues) + for more details. + pss_evaluation_shards: Integer or 'auto'. Used for + `tf.distribute.ParameterServerStrategy` training only. This arg + sets the number of shards to split the dataset into, to enable an + exact visitation guarantee for evaluation, meaning the model will + be applied to each dataset element exactly once, even if workers + fail. The dataset must be sharded to ensure separate workers do + not process the same data. The number of shards should be at least + the number of workers for good performance. A value of 'auto' + turns on exact evaluation and uses a heuristic for the number of + shards based on the number of workers. 0, meaning no + visitation guarantee is provided. NOTE: Custom implementations of + `Model.test_step` will be ignored when doing exact evaluation. + Defaults to `0`. + **kwargs: Arguments supported for backwards compatibility only. + """ + if jit_compile and not tf_utils.can_jit_compile(warn=True): + jit_compile = False + self._compile_config = serialization_lib.Config( + optimizer=optimizer, + loss=loss, + metrics=metrics, + loss_weights=loss_weights, + weighted_metrics=weighted_metrics, + run_eagerly=run_eagerly, + steps_per_execution=steps_per_execution, + jit_compile=jit_compile, + ) + with self.distribute_strategy.scope(): + if "experimental_steps_per_execution" in kwargs: + logging.warning( + "The argument `steps_per_execution` is no longer " + "experimental. 
Pass `steps_per_execution` instead of " + "`experimental_steps_per_execution`." + ) + if not steps_per_execution: + steps_per_execution = kwargs.pop("experimental_steps_per_execution") + + # When compiling from an already-serialized model, we do not want to + # reapply some processing steps (e.g. metric renaming for + # multi-output models, which have prefixes added for each + # corresponding output name). + from_serialized = kwargs.pop("from_serialized", False) + + self._validate_compile(optimizer, metrics, **kwargs) + self._run_eagerly = run_eagerly + + self.optimizer = self._get_optimizer(optimizer) + self.optimizer.global_step = self._train_counter + self.main_model.optimizer = self.optimizer + + mesh = None + if self._layout_map is not None: + mesh = self._layout_map.get_default_mesh() + + if isinstance(loss, compile_utils.LossesContainer): + self.compiled_loss = loss + else: + self.compiled_loss = compile_utils.LossesContainer( + loss, + loss_weights, + output_names=self.main_model.output_names, + mesh=mesh, + ) + self.compiled_metrics = compile_utils.MetricsContainer( + metrics, + weighted_metrics, + output_names=self.main_model.output_names, + from_serialized=from_serialized, + mesh=mesh, + ) + + if steps_per_execution == "auto": + if self._steps_per_execution is None: + self._configure_steps_per_execution(1) + self._steps_per_execution_tuner = ( + steps_per_execution_tuning.StepsPerExecutionTuner(self.optimizer, self._steps_per_execution) + ) + self._autotune_steps_per_execution = True + else: + self._configure_steps_per_execution(steps_per_execution or 1) + + self._pss_evaluation_shards = self._infer_exact_eval_shards(pss_evaluation_shards) + + # Initializes attrs that are reset each time `compile` is called. + self._reset_compile_cache() + self._is_compiled = True + self.loss = loss or {} + if (self._run_eagerly or self.main_model.dynamic) and jit_compile: + raise ValueError("You cannot enable `run_eagerly` and `jit_compile` " + "at the same time.") + else: + self._jit_compile = jit_compile + + def _get_optimizer(self, optimizer): + """Wraps `optimizer` in `LossScaleOptimizer` if necessary.""" + + def _get_single_optimizer(opt): + opt = optimizers.get(opt) + if self.main_model.dtype_policy.name == "mixed_float16" and not isinstance(opt, lso.BaseLossScaleOptimizer): + # Loss scaling is necessary with mixed_float16 for models to + # converge to the same accuracy as with float32. + opt = lso.BaseLossScaleOptimizer(opt) + return opt + + return tf.nest.map_structure(_get_single_optimizer, optimizer) + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _reset_compile_cache(self): + self.train_function = None + self.test_function = None + self.predict_function = None + # Used to cache the `tf.function`'ed `train_function` to be logged in + # TensorBoard, since the original `train_function` is not necessarily + # a `tf.function` (e.g., with ParameterServerStrategy, the + # `train_function` is a scheduling of the actual training function to a + # remote worker). + self.train_tf_function = None + + # Used to cache `trainable` attr of `Layer`s for `fit`. 
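    # (`fit()` below enters `training_utils.RespectCompiledTrainableState`,
    # which applies the `trainable` state cached here for the duration of
    # training, matching the usual Keras rule that `trainable` changes made
    # after `compile()` take effect only after recompiling.)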
+ self._compiled_trainable_state = self._get_trainable_state() + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _configure_steps_per_execution(self, steps_per_execution): + self._steps_per_execution = self._create_counter_variable(steps_per_execution) + + @property + def _should_compute_mask(self): + return False + + @property + def metrics(self): + """Return metrics added using `compile()` or `add_metric()`. + + Note: Metrics passed to `compile()` are available only after a + `keras.Model` has been trained/evaluated on actual data. + + Examples: + + >>> inputs = tf.keras.layers.Input(shape=(3,)) + >>> outputs = tf.keras.layers.Dense(2)(inputs) + >>> model = tf.keras.models.Model(inputs=inputs, outputs=outputs) + >>> model.compile(optimizer="Adam", loss="mse", metrics=["mae"]) + >>> [m.name for m in model.metrics] + [] + + >>> x = np.random.random((2, 3)) + >>> y = np.random.randint(0, 2, (2, 2)) + >>> model.fit(x, y) + >>> [m.name for m in model.metrics] + ['loss', 'mae'] + + >>> inputs = tf.keras.layers.Input(shape=(3,)) + >>> d = tf.keras.layers.Dense(2, name='out') + >>> output_1 = d(inputs) + >>> output_2 = d(inputs) + >>> model = tf.keras.models.Model( + ... inputs=inputs, outputs=[output_1, output_2]) + >>> model.add_metric( + ... tf.reduce_sum(output_2), name='mean', aggregation='mean') + >>> model.compile(optimizer="Adam", loss="mse", metrics=["mae", "acc"]) + >>> model.fit(x, (y, y)) + >>> [m.name for m in model.metrics] + ['loss', 'out_loss', 'out_1_loss', 'out_mae', 'out_acc', 'out_1_mae', + 'out_1_acc', 'mean'] + + """ + metrics = [] + if self._is_compiled: + if self.compiled_loss is not None: + metrics += self.compiled_loss.metrics + if self.compiled_metrics is not None: + metrics += self.compiled_metrics.metrics + + for l in self.main_model._flatten_layers(): + metrics.extend(l._metrics) + return metrics + + @property + def metrics_names(self): + """Returns the model's display labels for all outputs. + + Note: `metrics_names` are available only after a `keras.Model` has been + trained/evaluated on actual data. + + Examples: + + >>> inputs = tf.keras.layers.Input(shape=(3,)) + >>> outputs = tf.keras.layers.Dense(2)(inputs) + >>> model = tf.keras.models.Model(inputs=inputs, outputs=outputs) + >>> model.compile(optimizer="Adam", loss="mse", metrics=["mae"]) + >>> model.metrics_names + [] + + >>> x = np.random.random((2, 3)) + >>> y = np.random.randint(0, 2, (2, 2)) + >>> model.fit(x, y) + >>> model.metrics_names + ['loss', 'mae'] + + >>> inputs = tf.keras.layers.Input(shape=(3,)) + >>> d = tf.keras.layers.Dense(2, name='out') + >>> output_1 = d(inputs) + >>> output_2 = d(inputs) + >>> model = tf.keras.models.Model( + ... inputs=inputs, outputs=[output_1, output_2]) + >>> model.compile(optimizer="Adam", loss="mse", metrics=["mae", "acc"]) + >>> model.fit(x, (y, y)) + >>> model.metrics_names + ['loss', 'out_loss', 'out_1_loss', 'out_mae', 'out_acc', 'out_1_mae', + 'out_1_acc'] + + """ + + # This property includes all output names including `loss` and + # per-output losses for backward compatibility. + return [m.name for m in self.metrics] + + @property + def distribute_strategy(self): + """The `tf.distribute.Strategy` this model was created under.""" + return self._distribution_strategy or tf.distribute.get_strategy() + + @property + def run_eagerly(self): + """Settable attribute indicating whether the model should run eagerly. + + Running eagerly means that your model will be run step by step, + like Python code. 
Your model might run slower, but it should become + easier for you to debug it by stepping into individual layer calls. + + By default, we will attempt to compile your model to a static graph to + deliver the best execution performance. + + Returns: + Boolean, whether the model should run eagerly. + """ + if self.main_model.dynamic and self._run_eagerly == False: + # TODO(fchollet): consider using py_func to enable this. + raise ValueError( + "Your model contains layers that can only be " + "successfully run in eager execution (layers " + "constructed with `dynamic=True`). " + "You cannot set `run_eagerly=False`." + ) + + if self._cluster_coordinator and self._run_eagerly: + raise ValueError("When using `Model` with `ParameterServerStrategy`, " + "`run_eagerly` is not supported.") + + # Run eagerly logic, by priority: + # (1) Dynamic models must be run eagerly. + # (2) Explicitly setting run_eagerly causes a Model to be run eagerly. + # (3) Not explicitly setting run_eagerly defaults to TF's global + # setting. + return ( + self.main_model.dynamic or self._run_eagerly or + (tf.config.functions_run_eagerly() and self._run_eagerly is None) + ) + + @run_eagerly.setter + def run_eagerly(self, value): + self._run_eagerly = value + + @property + def autotune_steps_per_execution(self): + """Settable property to enable tuning for steps_per_execution""" + return self._autotune_steps_per_execution + + @autotune_steps_per_execution.setter + def autotune_steps_per_execution(self, value): + self._autotune_steps_per_execution = value + if value and self._steps_per_execution_tuner is None: + if self._steps_per_execution is None: + self._configure_steps_per_execution(1) + self._steps_per_execution_tuner = ( + steps_per_execution_tuning.StepsPerExecutionTuner(self.optimizer, self._steps_per_execution) + ) + + @property + def steps_per_execution(self): + """Settable `steps_per_execution variable. Requires a compiled model.""" + return self._steps_per_execution + + @steps_per_execution.setter + def steps_per_execution(self, value): + if self._steps_per_execution is None: + self._configure_steps_per_execution(value) + else: + self._steps_per_execution.assign(value) + + @property + def jit_compile(self): + """Specify whether to compile the model with XLA. + + [XLA](https://www.tensorflow.org/xla) is an optimizing compiler + for machine learning. `jit_compile` is not enabled by default. + Note that `jit_compile=True` may not necessarily work for all models. + + For more information on supported operations please refer to the + [XLA documentation](https://www.tensorflow.org/xla). Also refer to + [known XLA issues](https://www.tensorflow.org/xla/known_issues) + for more details. + """ + return self._jit_compile + + @jit_compile.setter + def jit_compile(self, value): + # Function remains cached with previous jit_compile settings + if self._jit_compile == value: + # Avoid resetting compiler cache if possible if the value is the + # same + return + # Check if TensorFlow is compiled with XLA before setting the value + if value and not tf_utils.can_jit_compile(warn=True): + self._jit_compile = False + return + + self._jit_compile = value + # Setting `jit_compile` should invalidate previously cached functions. + self._reset_compile_cache() + + @property + def distribute_reduction_method(self): + """The method employed to reduce per-replica values during training. + + Unless specified, the value "auto" will be assumed, indicating that + the reduction strategy should be chosen based on the current + running environment. 
+ See `reduce_per_replica` function for more details. + + """ + return self._distribute_reduction_method or "auto" + + @distribute_reduction_method.setter + def distribute_reduction_method(self, value): + self._distribute_reduction_method = value + + def _validate_target_and_loss(self, y, loss): + """Raises error if target or loss is not found. + + This method verifies that the target and loss are properly populated + when applicable, or raises errors. + + Args: + y: the target for training. + loss: the total loss tensor including loss added via `compile` and + `add_loss`. + """ + + # `self.loss` references the loss added via `compile` call. If users + # have provided such, the target must be provided; otherwise it's a user + # error. Note that `self.loss` does not include losses added via + # `add_loss`, and it is a valid use when such loss from `add_loss` + # exists and target does not. + if self.loss and y is None: + raise ValueError( + "Target data is missing. Your model was compiled with " + f"loss={self.loss}, " + "and therefore expects target data to be provided in `fit()`." + ) + + # For training, there must be compiled loss or regularization loss to + # exist in order to apply the gradients. If one is not found, it means + # no loss was supplied via `compile` or `add_loss`. + elif loss is None: + raise ValueError( + "No loss found. You may have forgotten to provide a `loss` " + "argument in the `compile()` method." + ) + + def train_step(self, data): + """The logic for one training step. + + This method can be overridden to support custom training logic. + For concrete examples of how to override this method see + [Customizing what happens in fit]( + https://www.tensorflow.org/guide/tf_keras/customizing_what_happens_in_fit). + This method is called by `Model.make_train_function`. + + This method should contain the mathematical logic for one step of + training. This typically includes the forward pass, loss calculation, + backpropagation, and metric updates. + + Configuration details for *how* this logic is run (e.g. `tf.function` + and `tf.distribute.Strategy` settings), should be left to + `Model.make_train_function`, which can also be overridden. + + Args: + data: A nested structure of `Tensor`s. + + Returns: + A `dict` containing values that will be passed to + `tf.keras.callbacks.CallbackList.on_train_batch_end`. Typically, the + values of the `Model`'s metrics are returned. Example: + `{'loss': 0.2, 'accuracy': 0.7}`. + """ + x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data) + # Run forward pass. + with tf.GradientTape() as tape: + y_pred = self.main_model(x, training=True) + loss = self.compute_loss(x, y, y_pred, sample_weight) + self._validate_target_and_loss(y, loss) + # Run backwards pass. + self.optimizer.minimize(loss, self.main_model.trainable_variables, tape=tape) + return self.compute_metrics(x, y, y_pred, sample_weight) + + def hvd_train_step(self, data): + """The logic for one training step on Horovod. + + This method can be overridden to support custom training logic. + For concrete examples of how to override this method see + [Customizing what happens in fit]( + https://www.tensorflow.org/guide/tf_keras/customizing_what_happens_in_fit). + This method is called by `Model.make_train_function`. + + This method should contain the mathematical logic for one step of + training. This typically includes the forward pass, loss calculation, + backpropagation, and metric updates. + + Configuration details for *how* this logic is run (e.g. 
`tf.function` + and `tf.distribute.Strategy` settings), should be left to + `Model.make_train_function`, which can also be overridden. + + Args: + data: A nested structure of `Tensor`s. + + Returns: + A `dict` containing values that will be passed to + `tf.keras.callbacks.CallbackList.on_train_batch_end`. Typically, the + values of the `Model`'s metrics are returned. Example: + `{'loss': 0.2, 'accuracy': 0.7}`. + """ + x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data) + # Run forward pass. + with tf.GradientTape() as dp_tape, tf.GradientTape() as mp_tape: + y_pred = self.main_model(x, training=True) + loss = self.compute_loss(x, y, y_pred, sample_weight) + dp_tape = hvd.DistributedGradientTape(dp_tape, sparse_as_dense=False) + self._validate_target_and_loss(y, loss) + # Run backwards pass. + dp_vars, mp_vars = [], [] + for x in self.main_model.variables: + if isinstance(x, kv_variable_ops.EmbeddingVariable): + mp_vars.append(x) + else: + dp_vars.append(x) + self.optimizer.minimize(loss, dp_vars, tape=dp_tape) + self.optimizer.minimize(loss, mp_vars, tape=mp_tape) + return self.compute_metrics(x, y, y_pred, sample_weight) + + def compute_loss(self, x=None, y=None, y_pred=None, sample_weight=None): + """Compute the total loss, validate it, and return it. + + Subclasses can optionally override this method to provide custom loss + computation logic. + + Example: + ```python + class MyModel(tf.keras.Model): + + def __init__(self, *args, **kwargs): + super(MyModel, self).__init__(*args, **kwargs) + self.loss_tracker = tf.keras.metrics.Mean(name='loss') + + def compute_loss(self, x, y, y_pred, sample_weight): + loss = tf.reduce_mean(tf.math.squared_difference(y_pred, y)) + loss += tf.add_n(self.losses) + self.loss_tracker.update_state(loss) + return loss + + def reset_metrics(self): + self.loss_tracker.reset_states() + + @property + def metrics(self): + return [self.loss_tracker] + + tensors = tf.random.uniform((10, 10)), tf.random.uniform((10,)) + dataset = tf.data.Dataset.from_tensor_slices(tensors).repeat().batch(1) + + inputs = tf.keras.layers.Input(shape=(10,), name='my_input') + outputs = tf.keras.layers.Dense(10)(inputs) + model = MyModel(inputs, outputs) + model.add_loss(tf.reduce_sum(outputs)) + + optimizer = tf.keras.optimizers.SGD() + model.compile(optimizer, loss='mse', steps_per_execution=10) + model.fit(dataset, epochs=2, steps_per_epoch=10) + print('My custom loss: ', model.loss_tracker.result().numpy()) + ``` + + Args: + x: Input data. + y: Target data. + y_pred: Predictions returned by the model (output of `model(x)`) + sample_weight: Sample weights for weighting the loss function. + + Returns: + The total loss as a `tf.Tensor`, or `None` if no loss results (which + is the case when called by `Model.test_step`). + """ + del x # The default implementation does not use `x`. + return self.compiled_loss(y, y_pred, sample_weight, regularization_losses=self.main_model.losses) + + def compute_metrics(self, x, y, y_pred, sample_weight): + """Update metric states and collect all metrics to be returned. + + Subclasses can optionally override this method to provide custom metric + updating and collection logic. + + Example: + ```python + class MyModel(tf.keras.Sequential): + + def compute_metrics(self, x, y, y_pred, sample_weight): + + # This super call updates `self.compiled_metrics` and returns + # results for all metrics listed in `self.metrics`. 
+ metric_results = super(MyModel, self).compute_metrics( + x, y, y_pred, sample_weight) + + # Note that `self.custom_metric` is not listed in `self.metrics`. + self.custom_metric.update_state(x, y, y_pred, sample_weight) + metric_results['custom_metric_name'] = self.custom_metric.result() + return metric_results + ``` + + Args: + x: Input data. + y: Target data. + y_pred: Predictions returned by the model (output of `model.call(x)`) + sample_weight: Sample weights for weighting the loss function. + + Returns: + A `dict` containing values that will be passed to + `tf.keras.callbacks.CallbackList.on_train_batch_end()`. Typically, the + values of the metrics listed in `self.metrics` are returned. Example: + `{'loss': 0.2, 'accuracy': 0.7}`. + """ + del x # The default implementation does not use `x`. + self.compiled_metrics.update_state(y, y_pred, sample_weight) + return self.get_metrics_result() + + def get_metrics_result(self): + """Returns the model's metrics values as a dict. + + If any of the metric result is a dict (containing multiple metrics), + each of them gets added to the top level returned dict of this method. + + Returns: + A `dict` containing values of the metrics listed in `self.metrics`. + Example: + `{'loss': 0.2, 'accuracy': 0.7}`. + """ + # Collect metrics to return + return_metrics = {} + for metric in self.metrics: + result = metric.result() + if isinstance(result, dict): + return_metrics.update(result) + else: + return_metrics[metric.name] = result + return return_metrics + + def _validate_and_get_metrics_result(self, logs): + """Returns model metrics as a dict if the keys match with input logs. + + When the training / evalution is performed with asynchronous steps, such + as the case with `tf.distribute.ParameterServerStrategy`, the last + scheduled `train / test_step` may not give the latest metrics because it + is not guaranteed to be executed the last. This method gets metrics from + the model directly instead of relying on the return from last step + function. + + It logs a warning if the metric results could not be overridden when + used with `tf.distribute.ParameterServerStrategy`. + + When the user has custom train / test step functions, the metrics + returned may be different from `Model.metrics`. In those instances, + this function will be no-op and return the logs. + + Args: + logs: A `dict` of metrics returned by train / test step function. + + Returns: + A `dict` containing values of the metrics listed in `self.metrics` + when logs and model metrics keys match. Otherwise it returns input + `logs`. + """ + PSS_WARN_MSG = "Could not get Model metric results. \ + Using the results of last step function could lead to incorrect \ + results when used with ParameterServerStrategy" + + try: + metric_logs = self.get_metrics_result() + except TypeError: + if self._cluster_coordinator: + logging.warning(PSS_WARN_MSG) + else: + # Verify that train / test step logs passed and metric logs have + # matching keys. Could be different when using custom step functions + if isinstance(logs, dict) and set(logs.keys()) == set(metric_logs.keys()): + logs = tf_utils.sync_to_numpy_or_python_type(metric_logs) + elif self._cluster_coordinator: + logging.warning(PSS_WARN_MSG) + return logs + + def _aggregate_exact_metrics(self, logs): + # When doing exact evaluation, `logs` is a list of each data shard's + # metric variables, which will be used to update the metrics. 
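    # Illustrative (hypothetical names/values): `logs` might look like
    #   [{"loss": [sum_0, count_0], "mae": [sum_0, count_0]},   # shard 0
    #    {"loss": [sum_1, count_1], "mae": [sum_1, count_1]}]   # shard 1
    # i.e. one dict per shard mapping a metric name to that shard's local
    # weight values; the loop below accumulates them into the live metrics
    # before the final results are read.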
+ for shard_result in logs: + for metric in self.metrics: + if metric.name not in shard_result.keys(): + logging.log_first_n( + logging.WARN, + f"No matching result found for metric {metric.name}. " + "This metric's computed result may be incorrect.", + 3, + ) + continue + metric_result = shard_result[metric.name] + if len(metric_result) != len(metric.weights): + raise ValueError( + f"Expected {len(metric.weights)} variables in result " + f"for metric {metric.name}, but found " + f"{len(metric_result)}." + ) + for weight, val in zip(metric.weights, metric_result): + weight.assign_add(val) + return self.get_metrics_result() + + def make_train_function(self, force=False): + """Creates a function that executes one step of training. + + This method can be overridden to support custom training logic. + This method is called by `Model.fit` and `Model.train_on_batch`. + + Typically, this method directly controls `tf.function` and + `tf.distribute.Strategy` settings, and delegates the actual training + logic to `Model.train_step`. + + This function is cached the first time `Model.fit` or + `Model.train_on_batch` is called. The cache is cleared whenever + `Model.compile` is called. You can skip the cache and generate again the + function with `force=True`. + + Args: + force: Whether to regenerate the train function and skip the cached + function if available. + + Returns: + Function. The function created by this method should accept a + `tf.data.Iterator`, and return a `dict` containing values that will + be passed to `tf.keras.Callbacks.on_train_batch_end`, such as + `{'loss': 0.2, 'accuracy': 0.7}`. + """ + if self.train_function is not None and not force: + return self.train_function + + def step_function(iterator): + """Runs a single training step.""" + + def run_step(data): + outputs = self.train_step(data) + # Ensure counter is updated only if `train_step` succeeds. + with tf.control_dependencies(training_module._minimum_control_deps(outputs)): + self._train_counter.assign_add(1) + return outputs + + if self.jit_compile: + run_step = tf.function(run_step, jit_compile=True, reduce_retracing=True) + data = next(iterator) + outputs = self.distribute_strategy.run(run_step, args=(data,)) + outputs = training_module.reduce_per_replica( + outputs, + self.distribute_strategy, + reduction=self.distribute_reduction_method, + ) + return outputs + + # Special case if steps_per_execution is one. + if ( + self._steps_per_execution is None or + self._steps_per_execution.numpy().item() == 1 and not self.autotune_steps_per_execution + ): + + def train_function(iterator): + """Runs a training execution with a single step.""" + return step_function(iterator) + + if not self.run_eagerly: + train_function = tf.function(train_function, reduce_retracing=True) + self.train_tf_function = train_function + + if self._cluster_coordinator: + self.train_function = (lambda it: self._cluster_coordinator.schedule(train_function, args=(it,))) + else: + self.train_function = train_function + + # If we're using a coordinator, use the value of + # self._steps_per_execution at the time the function is + # called/scheduled, and not when it is actually executed. 
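      # In the coordinator branch below, the step count is read once at
      # schedule time (`self._steps_per_execution.value()`); the plain branch
      # after it instead closes over the variable and reads it when the
      # `tf.function` actually executes.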
+ elif self._cluster_coordinator: + + def train_function(iterator, steps_per_execution): + """Runs a training execution with multiple steps.""" + for _ in tf.range(steps_per_execution): + outputs = step_function(iterator) + return outputs + + if not self.run_eagerly: + train_function = tf.function(train_function, reduce_retracing=True) + self.train_tf_function = train_function + + self.train_function = lambda it: self._cluster_coordinator.schedule( + train_function, args=(it, self._steps_per_execution.value()) + ) + else: + + def train_function(iterator): + """Runs a training execution with multiple steps.""" + for _ in tf.range(self._steps_per_execution): + outputs = step_function(iterator) + return outputs + + if not self.run_eagerly: + train_function = tf.function(train_function, reduce_retracing=True) + self.train_tf_function = train_function + self.train_function = train_function + + return self.train_function + + def make_hvd_train_function(self, force=False): + """Creates a function that executes one step of training. + + This method can be overridden to support custom training logic. + This method is called by `Model.fit` and `Model.train_on_batch`. + + Typically, this method directly controls `tf.function` and + `tf.distribute.Strategy` settings, and delegates the actual training + logic to `Model.train_step`. + + This function is cached the first time `Model.fit` or + `Model.train_on_batch` is called. The cache is cleared whenever + `Model.compile` is called. You can skip the cache and generate again the + function with `force=True`. + + Args: + force: Whether to regenerate the train function and skip the cached + function if available. + + Returns: + Function. The function created by this method should accept a + `tf.data.Iterator`, and return a `dict` containing values that will + be passed to `tf.keras.Callbacks.on_train_batch_end`, such as + `{'loss': 0.2, 'accuracy': 0.7}`. + """ + if self.train_function is not None and not force: + return self.train_function + + def step_function(iterator): + """Runs a single training step.""" + + def do_broadcast(): + model_broadcast_vars = [ + x for x in self.main_model.variables + if not isinstance(x, (TrainableWrapper, DEResourceVariable, kv_variable_ops.EmbeddingVariable)) + ] + opt_broadcast_vars = [ + x for x in self.optimizer.variables() + if not isinstance(x, (TrainableWrapper, DEResourceVariable, kv_variable_ops.EmbeddingVariable)) + ] + print_op = tf.print( + f"Broadcasting {len(model_broadcast_vars)} model variables & {len(opt_broadcast_vars)} optimizer variables...", + output_stream=sys.stdout + ) + with tf.control_dependencies([print_op]): + hvd.broadcast_variables(model_broadcast_vars + opt_broadcast_vars, root_rank=0) + self.first_batch.assign(False) + + def run_step(data): + outputs = self.hvd_train_step(data) + # Ensure counter is updated only if `hvd_train_step` succeeds. + with tf.control_dependencies(training_module._minimum_control_deps(outputs)): + self._train_counter.assign_add(1) + if self.first_batch: + do_broadcast() + return outputs + + if self.jit_compile: + run_step = tf.function(run_step, jit_compile=True, reduce_retracing=True) + data = next(iterator) + outputs = run_step(data) + return outputs + + # Special case if steps_per_execution is one. 
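      # Note: unlike `make_train_function` above, the Horovod path invokes
      # `run_step` directly rather than through `distribute_strategy.run` plus
      # a per-replica reduction; each worker consumes its own data shard,
      # dense-variable gradients are averaged across workers by
      # `hvd.DistributedGradientTape` inside `hvd_train_step`, and
      # `EmbeddingVariable`s are updated from the local tape.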
+ if ( + self._steps_per_execution is None or + self._steps_per_execution.numpy().item() == 1 and not self.autotune_steps_per_execution + ): + + def train_function(iterator): + """Runs a training execution with a single step.""" + return step_function(iterator) + + if not self.run_eagerly: + train_function = tf.function(train_function, reduce_retracing=True) + self.train_tf_function = train_function + + self.train_function = train_function + else: + + def train_function(iterator): + """Runs a training execution with multiple steps.""" + for _ in tf.range(self._steps_per_execution): + outputs = step_function(iterator) + return outputs + + if not self.run_eagerly: + train_function = tf.function(train_function, reduce_retracing=True) + self.train_tf_function = train_function + self.train_function = train_function + + return self.train_function + + @traceback_utils.filter_traceback + def fit( + self, + x=None, + y=None, + batch_size=None, + epochs=None, + verbose="auto", + callbacks=[], + validation_split=0.0, + validation_data=None, + shuffle=True, + class_weight=None, + sample_weight=None, + initial_epoch=0, + steps_per_epoch=None, + validation_steps=None, + validation_batch_size=None, + validation_freq=1, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + ): + """Trains the model for a fixed number of epochs (dataset iterations). + + Args: + x: Input data. It could be: + - A Numpy array (or array-like), or a list of arrays + (in case the model has multiple inputs). + - A TensorFlow tensor, or a list of tensors + (in case the model has multiple inputs). + - A dict mapping input names to the corresponding array/tensors, + if the model has named inputs. + - A `tf.data` dataset. Should return a tuple + of either `(inputs, targets)` or + `(inputs, targets, sample_weights)`. + - A generator or `keras.utils.Sequence` returning `(inputs, + targets)` or `(inputs, targets, sample_weights)`. + - A `tf.keras.utils.experimental.DatasetCreator`, which wraps a + callable that takes a single argument of type + `tf.distribute.InputContext`, and returns a `tf.data.Dataset`. + `DatasetCreator` should be used when users prefer to specify the + per-replica batching and sharding logic for the `Dataset`. + See `tf.keras.utils.experimental.DatasetCreator` doc for more + information. + A more detailed description of unpacking behavior for iterator + types (Dataset, generator, Sequence) is given below. If these + include `sample_weights` as a third component, note that sample + weighting applies to the `weighted_metrics` argument but not the + `metrics` argument in `compile()`. If using + `tf.distribute.experimental.ParameterServerStrategy`, only + `DatasetCreator` type is supported for `x`. + y: Target data. Like the input data `x`, + it could be either Numpy array(s) or TensorFlow tensor(s). + It should be consistent with `x` (you cannot have Numpy inputs and + tensor targets, or inversely). If `x` is a dataset, generator, + or `keras.utils.Sequence` instance, `y` should + not be specified (since targets will be obtained from `x`). + batch_size: Integer or `None`. + Number of samples per gradient update. + If unspecified, `batch_size` will default to 32. + Do not specify the `batch_size` if your data is in the + form of datasets, generators, or `keras.utils.Sequence` + instances (since they generate batches). + epochs: Integer. Number of epochs to train the model. 
+ An epoch is an iteration over the entire `x` and `y` + data provided + (unless the `steps_per_epoch` flag is set to + something other than None). + Note that in conjunction with `initial_epoch`, + `epochs` is to be understood as "final epoch". + The model is not trained for a number of iterations + given by `epochs`, but merely until the epoch + of index `epochs` is reached. + verbose: 'auto', 0, 1, or 2. Verbosity mode. + 0 = silent, 1 = progress bar, 2 = one line per epoch. + 'auto' becomes 1 for most cases, but 2 when used with + `ParameterServerStrategy`. Note that the progress bar is not + particularly useful when logged to a file, so verbose=2 is + recommended when not running interactively (eg, in a production + environment). Defaults to 'auto'. + callbacks: List of `keras.callbacks.Callback` instances. + List of callbacks to apply during training. + See `tf.keras.callbacks`. Note + `tf.keras.callbacks.ProgbarLogger` and + `tf.keras.callbacks.History` callbacks are created automatically + and need not be passed into `model.fit`. + `tf.keras.callbacks.ProgbarLogger` is created or not based on + `verbose` argument to `model.fit`. + Callbacks with batch-level calls are currently unsupported with + `tf.distribute.experimental.ParameterServerStrategy`, and users + are advised to implement epoch-level calls instead with an + appropriate `steps_per_epoch` value. + validation_split: Float between 0 and 1. + Fraction of the training data to be used as validation data. + The model will set apart this fraction of the training data, + will not train on it, and will evaluate + the loss and any model metrics + on this data at the end of each epoch. + The validation data is selected from the last samples + in the `x` and `y` data provided, before shuffling. This + argument is not supported when `x` is a dataset, generator or + `keras.utils.Sequence` instance. + If both `validation_data` and `validation_split` are provided, + `validation_data` will override `validation_split`. + `validation_split` is not yet supported with + `tf.distribute.experimental.ParameterServerStrategy`. + validation_data: Data on which to evaluate + the loss and any model metrics at the end of each epoch. + The model will not be trained on this data. Thus, note the fact + that the validation loss of data provided using + `validation_split` or `validation_data` is not affected by + regularization layers like noise and dropout. + `validation_data` will override `validation_split`. + `validation_data` could be: + - A tuple `(x_val, y_val)` of Numpy arrays or tensors. + - A tuple `(x_val, y_val, val_sample_weights)` of NumPy + arrays. + - A `tf.data.Dataset`. + - A Python generator or `keras.utils.Sequence` returning + `(inputs, targets)` or `(inputs, targets, sample_weights)`. + `validation_data` is not yet supported with + `tf.distribute.experimental.ParameterServerStrategy`. + shuffle: Boolean (whether to shuffle the training data + before each epoch) or str (for 'batch'). This argument is + ignored when `x` is a generator or an object of tf.data.Dataset. + 'batch' is a special option for dealing + with the limitations of HDF5 data; it shuffles in batch-sized + chunks. Has no effect when `steps_per_epoch` is not `None`. + class_weight: Optional dictionary mapping class indices (integers) + to a weight (float) value, used for weighting the loss function + (during training only). + This can be useful to tell the model to + "pay more attention" to samples from + an under-represented class. 
When `class_weight` is specified + and targets have a rank of 2 or greater, either `y` must be + one-hot encoded, or an explicit final dimension of `1` must + be included for sparse class labels. + sample_weight: Optional Numpy array of weights for + the training samples, used for weighting the loss function + (during training only). You can either pass a flat (1D) + Numpy array with the same length as the input samples + (1:1 mapping between weights and samples), + or in the case of temporal data, + you can pass a 2D array with shape + `(samples, sequence_length)`, + to apply a different weight to every timestep of every sample. + This argument is not supported when `x` is a dataset, generator, + or `keras.utils.Sequence` instance, instead provide the + sample_weights as the third element of `x`. + Note that sample weighting does not apply to metrics specified + via the `metrics` argument in `compile()`. To apply sample + weighting to your metrics, you can specify them via the + `weighted_metrics` in `compile()` instead. + initial_epoch: Integer. + Epoch at which to start training + (useful for resuming a previous training run). + steps_per_epoch: Integer or `None`. + Total number of steps (batches of samples) + before declaring one epoch finished and starting the + next epoch. When training with input tensors such as + TensorFlow data tensors, the default `None` is equal to + the number of samples in your dataset divided by + the batch size, or 1 if that cannot be determined. If x is a + `tf.data` dataset, and 'steps_per_epoch' + is None, the epoch will run until the input dataset is + exhausted. When passing an infinitely repeating dataset, you + must specify the `steps_per_epoch` argument. If + `steps_per_epoch=-1` the training will run indefinitely with an + infinitely repeating dataset. This argument is not supported + with array inputs. + When using `tf.distribute.experimental.ParameterServerStrategy`: + * `steps_per_epoch=None` is not supported. + validation_steps: Only relevant if `validation_data` is provided and + is a `tf.data` dataset. Total number of steps (batches of + samples) to draw before stopping when performing validation + at the end of every epoch. If 'validation_steps' is None, + validation will run until the `validation_data` dataset is + exhausted. In the case of an infinitely repeated dataset, it + will run into an infinite loop. If 'validation_steps' is + specified and only part of the dataset will be consumed, the + evaluation will start from the beginning of the dataset at each + epoch. This ensures that the same validation samples are used + every time. + validation_batch_size: Integer or `None`. + Number of samples per validation batch. + If unspecified, will default to `batch_size`. + Do not specify the `validation_batch_size` if your data is in + the form of datasets, generators, or `keras.utils.Sequence` + instances (since they generate batches). + validation_freq: Only relevant if validation data is provided. + Integer or `collections.abc.Container` instance (e.g. list, tuple, + etc.). If an integer, specifies how many training epochs to run + before a new validation run is performed, e.g. `validation_freq=2` + runs validation every 2 epochs. If a Container, specifies the + epochs on which to run validation, e.g. + `validation_freq=[1, 2, 10]` runs validation at the end of the + 1st, 2nd, and 10th epochs. + max_queue_size: Integer. Used for generator or + `keras.utils.Sequence` input only. Maximum size for the generator + queue. 
If unspecified, `max_queue_size` will default to 10. + workers: Integer. Used for generator or `keras.utils.Sequence` input + only. Maximum number of processes to spin up + when using process-based threading. If unspecified, `workers` + will default to 1. + use_multiprocessing: Boolean. Used for generator or + `keras.utils.Sequence` input only. If `True`, use process-based + threading. If unspecified, `use_multiprocessing` will default to + `False`. Note that because this implementation relies on + multiprocessing, you should not pass non-pickleable arguments to + the generator as they can't be passed easily to children + processes. + + Unpacking behavior for iterator-like inputs: + A common pattern is to pass a tf.data.Dataset, generator, or + tf.keras.utils.Sequence to the `x` argument of fit, which will in fact + yield not only features (x) but optionally targets (y) and sample + weights. TF-Keras requires that the output of such iterator-likes be + unambiguous. The iterator should return a tuple of length 1, 2, or 3, + where the optional second and third elements will be used for y and + sample_weight respectively. Any other type provided will be wrapped in + a length one tuple, effectively treating everything as 'x'. When + yielding dicts, they should still adhere to the top-level tuple + structure. + e.g. `({"x0": x0, "x1": x1}, y)`. TF-Keras will not attempt to + separate features, targets, and weights from the keys of a single + dict. + A notable unsupported data type is the namedtuple. The reason is + that it behaves like both an ordered datatype (tuple) and a mapping + datatype (dict). So given a namedtuple of the form: + `namedtuple("example_tuple", ["y", "x"])` + it is ambiguous whether to reverse the order of the elements when + interpreting the value. Even worse is a tuple of the form: + `namedtuple("other_tuple", ["x", "y", "z"])` + where it is unclear if the tuple was intended to be unpacked into x, + y, and sample_weight or passed through as a single element to `x`. As + a result the data processing code will simply raise a ValueError if it + encounters a namedtuple. (Along with instructions to remedy the + issue.) + + Returns: + A `History` object. Its `History.history` attribute is + a record of training loss values and metrics values + at successive epochs, as well as validation loss values + and validation metrics values (if applicable). + + Raises: + RuntimeError: 1. If the model was never compiled or, + 2. If `model.fit` is wrapped in `tf.function`. + + ValueError: In case of mismatch between the provided input data + and what the model expects or when the input data is empty. + """ + if steps_per_epoch and flags.FLAGS.use_horovod: + try: + import horovod.tensorflow as hvd + steps_array = hvd.allgather_object(steps_per_epoch, name='check_train_step') + logger.debug(f"steps_array = {steps_array}") + assert max(set(steps_array)) == min(set(steps_array)) + except: + raise ValueError( + f"steps_per_epoch = {steps_per_epoch}, different rank should have same steps when using Horovod." + ) + # Legacy graph support is contained in `training_v1.Model`. 
+ if batch_size is None: + batch_size = flags.FLAGS.batch_size + if epochs is None: + epochs = flags.FLAGS.epochs + if flags.FLAGS.stop_steps >= 0: + epochs = 1 + if steps_per_epoch is None: + steps_per_epoch = flags.FLAGS.stop_steps + else: + steps_per_epoch = min(steps_per_epoch, flags.FLAGS.stop_steps) + + version_utils.disallow_legacy_graph("Model", "fit") + self._assert_compile_was_called() + self._check_call_args("fit") + training_module._disallow_inside_tf_function("fit") + + verbose = training_module._get_verbosity(verbose, self.distribute_strategy) + + if validation_split and validation_data is None: + # Create the validation data using the training data. Only supported + # for `Tensor` and `NumPy` input. + ( + x, + y, + sample_weight, + ), validation_data = data_adapter.train_validation_split( + (x, y, sample_weight), validation_split=validation_split + ) + + if validation_data: + ( + val_x, + val_y, + val_sample_weight, + ) = data_adapter.unpack_x_y_sample_weight(validation_data) + + if self.distribute_strategy._should_use_with_coordinator: + self._cluster_coordinator = (tf.distribute.experimental.coordinator.ClusterCoordinator(self.distribute_strategy)) + + with self.distribute_strategy.scope(), training_utils.RespectCompiledTrainableState( # noqa: E501 + self + ): + # Creates a `tf.data.Dataset` and handles batch and epoch iteration. + data_handler = data_adapter.get_data_handler( + x=x, + y=y, + sample_weight=sample_weight, + batch_size=batch_size, + steps_per_epoch=steps_per_epoch, + initial_epoch=initial_epoch, + epochs=epochs, + shuffle=shuffle, + class_weight=class_weight, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + model=self, + steps_per_execution=self._steps_per_execution, + ) + + for callback in callbacks: + if hasattr(callback, 'set_optimizer') and callable(callback.set_optimizer): + callback.set_optimizer(self.optimizer) + if hasattr(callback, 'set_models') and callable(callback.set_models): + callback.set_models(self._model) + + # Container that configures and calls `tf.keras.Callback`s. + if not isinstance(callbacks, callbacks_module.CallbackList): + if flags.FLAGS.use_horovod: + if is_main_process(): + callbacks += [ProgbarLogger(count_mode="steps")] + callbacks = HvdCallbackList( + callbacks, + add_history=True, + add_progbar=False, + model=self.main_model, + verbose=verbose, + epochs=epochs, + steps=data_handler.inferred_steps, + ) + else: + callbacks = callbacks_module.CallbackList( + callbacks, + add_history=True, + add_progbar=verbose != 0, + model=self.main_model, + verbose=verbose, + epochs=epochs, + steps=data_handler.inferred_steps, + ) + + self.stop_training = False + self.train_function = self.make_train_function() if not flags.FLAGS.use_horovod else self.make_hvd_train_function( + ) + self._train_counter.assign(0) + callbacks.on_train_begin() + training_logs = None + if self.autotune_steps_per_execution: + self._steps_per_execution_tuner.start() + # Handle fault-tolerance for multi-worker. + # TODO(omalleyt): Fix the ordering issues that mean this has to + # happen after `callbacks.on_train_begin`. 
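      # Resume support: `_maybe_load_initial_counters_from_ckpt` (defined
      # elsewhere in this class) consults the training state that
      # `ModelCheckpoint` maintains (see `self._training_state` in `__init__`),
      # so a restarted job picks up from the last completed epoch/step instead
      # of the user-supplied `initial_epoch`.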
+ steps_per_epoch_inferred = (steps_per_epoch or data_handler.inferred_steps) + ( + data_handler._initial_epoch, + data_handler._initial_step, + ) = self._maybe_load_initial_counters_from_ckpt(steps_per_epoch_inferred, initial_epoch) + logs = None + for epoch, iterator in data_handler.enumerate_epochs(): + self.reset_metrics() + callbacks.on_epoch_begin(epoch) + with data_handler.catch_stop_iteration(): + for step in data_handler.steps(): + with tf.profiler.experimental.Trace( + "train", + epoch_num=epoch, + step_num=step, + batch_size=batch_size, + _r=1, + ): + callbacks.on_train_batch_begin(step) + tmp_logs = self.train_function(iterator) + if data_handler.should_sync: + context.async_wait() + # No error, now safe to assign to logs. + logs = tmp_logs + end_step = step + data_handler.step_increment + callbacks.on_train_batch_end(end_step, logs) + if self.stop_training: + break + + logs = tf_utils.sync_to_numpy_or_python_type(logs) + if logs is None: + raise ValueError( + "Unexpected result of `train_function` " + "(Empty logs). This could be due to issues in input " + "pipeline that resulted in an empty dataset. " + "Otherwise, please use " + "`Model.compile(..., run_eagerly=True)`, or " + "`tf.config.run_functions_eagerly(True)` for more " + "information of where went wrong, or file a " + "issue/bug to `tf.keras`." + ) + # Override with model metrics instead of last step logs + logs = self._validate_and_get_metrics_result(logs) + epoch_logs = copy.copy(logs) + + # Run validation. + if validation_data and self._should_eval(epoch, validation_freq): + if self._pss_evaluation_shards: + self._disallow_exact_eval_with_add_metrics() + # Create data_handler for evaluation and cache it. + if getattr(self, "_eval_data_handler", None) is None: + self._eval_data_handler = data_adapter.get_data_handler( + x=val_x, + y=val_y, + sample_weight=val_sample_weight, + batch_size=validation_batch_size or batch_size, + steps_per_epoch=validation_steps, + initial_epoch=0, + epochs=1, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + model=self, + steps_per_execution=self._steps_per_execution, + pss_evaluation_shards=self._pss_evaluation_shards, + ) + val_logs = self.evaluate( + x=val_x, + y=val_y, + sample_weight=val_sample_weight, + batch_size=validation_batch_size or batch_size, + steps=validation_steps, + callbacks=callbacks, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + return_dict=True, + _use_cached_eval_dataset=True, + ) + val_logs = {"val_" + name: val for name, val in val_logs.items()} + epoch_logs.update(val_logs) + + callbacks.on_epoch_end(epoch, epoch_logs) + training_logs = epoch_logs + if self.stop_training: + break + + if isinstance(self.optimizer, optimizer.Optimizer) and epochs > 0: + self.optimizer.finalize_variable_values(self.trainable_variables) + + # If eval data_handler exists, delete it after all epochs are done. + if getattr(self, "_eval_data_handler", None) is not None: + del self._eval_data_handler + if self.autotune_steps_per_execution: + self._steps_per_execution_tuner.stop() + callbacks.on_train_end(logs=training_logs) + return self.history + + def test_step(self, data): + """The logic for one evaluation step. + + This method can be overridden to support custom evaluation logic. + This method is called by `Model.make_test_function`. + + This function should contain the mathematical logic for one step of + evaluation. 
+ This typically includes the forward pass, loss calculation, and metrics + updates. + + Configuration details for *how* this logic is run (e.g. `tf.function` + and `tf.distribute.Strategy` settings) should be left to + `Model.make_test_function`, which can also be overridden. + + Args: + data: A nested structure of `Tensor`s. + + Returns: + A `dict` containing values that will be passed to + `tf.keras.callbacks.CallbackList.on_test_batch_end`. Typically, the + values of the `Model`'s metrics are returned. + """ + x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data) + + y_pred = self.main_model(x, training=False) + # Updates stateful loss metrics. + self.compute_loss(x, y, y_pred, sample_weight) + return self.compute_metrics(x, y, y_pred, sample_weight) + + def _make_test_function_exact(self): + if getattr(self, "_shard_test_function", None): + return self._shard_test_function + + def step_function(batch): + + def run_step(data): + # TODO(b/272050910): Use sample_weight for weighted metrics. + x, y, sample_weight = data_adapter.unpack_x_y_sample_weight(data) + y_pred = self.main_model(x, training=False) + return x, y, y_pred, sample_weight + + if self._jit_compile: + run_step = tf.function(run_step, jit_compile=True, reduce_retracing=True) + + outputs = self.distribute_strategy.run(run_step, args=(batch,)) + outputs = training_module.reduce_per_replica( + outputs, + self.distribute_strategy, + reduction=self.distribute_reduction_method, + ) + return outputs + + def shard_test_function(dataset, total_shards, shard_idx): + # Copy loss and metric variables to the worker and work with them + # locally. This ensures each shard function is atomic: if a worker + # is preempted, the intermediate progress is discarded and that + # shard is retried. This in turn guarantees exactly-once visitation. + local_unweighted_metrics, local_weighted_metrics = [], [] + with tf_utils.with_metric_local_vars_scope(): + # TODO(jmullenbach): implement and use a clone for + # `MetricsContainer` and use its `update_state` method directly.
+ for metric in self.compiled_metrics.unweighted_metrics: + if metric is not None: + local_unweighted_metrics.append(base_metric.clone_metric(metric)) + for metric in self.compiled_metrics.weighted_metrics: + if metric is not None: + local_weighted_metrics.append(base_metric.clone_metric(metric)) + local_loss = compile_utils.LossesContainer.from_config(self.compiled_loss.get_config()) + + dataset = input_ops.auto_shard_dataset(dataset, total_shards, shard_idx) + iterator = iter(dataset) + with distribute_utils.cache_variable_reads(): + for batch in iterator: + x, y, y_pred, sample_weight = step_function(batch) + for weighted_metric in local_weighted_metrics: + weighted_metric.update_state(y, y_pred, sample_weight) + for unweighted_metric in local_unweighted_metrics: + unweighted_metric.update_state(y, y_pred) + local_loss(y, y_pred, sample_weight) + local_metrics = (local_unweighted_metrics + local_weighted_metrics + local_loss.metrics) + outputs = {metric.name: metric.weights for metric in local_metrics} + with tf.control_dependencies(training_module._minimum_control_deps(outputs)): + self._test_counter.assign_add(1) + return outputs + + if not self.run_eagerly: + shard_test_function = tf.function(shard_test_function, reduce_retracing=True) + + self._shard_test_function = (lambda *args: self._cluster_coordinator.schedule( + shard_test_function, + args=args, + )) + return self._shard_test_function + + def make_test_function(self, force=False): + """Creates a function that executes one step of evaluation. + + This method can be overridden to support custom evaluation logic. + This method is called by `Model.evaluate` and `Model.test_on_batch`. + + Typically, this method directly controls `tf.function` and + `tf.distribute.Strategy` settings, and delegates the actual evaluation + logic to `Model.test_step`. + + This function is cached the first time `Model.evaluate` or + `Model.test_on_batch` is called. The cache is cleared whenever + `Model.compile` is called. You can skip the cache and generate again the + function with `force=True`. + + Args: + force: Whether to regenerate the test function and skip the cached + function if available. + + Returns: + Function. The function created by this method should accept a + `tf.data.Iterator`, and return a `dict` containing values that will + be passed to `tf.keras.Callbacks.on_test_batch_end`. + """ + if self.test_function is not None and not force: + return self.test_function + + def step_function(iterator): + """Runs a single evaluation step.""" + + def run_step(data): + outputs = self.test_step(data) + # Ensure counter is updated only if `test_step` succeeds. + with tf.control_dependencies(training_module._minimum_control_deps(outputs)): + self._test_counter.assign_add(1) + return outputs + + if self.jit_compile: + run_step = tf.function(run_step, jit_compile=True, reduce_retracing=True) + + data = next(iterator) + outputs = self.distribute_strategy.run(run_step, args=(data,)) + outputs = training_module.reduce_per_replica( + outputs, + self.distribute_strategy, + reduction=self.distribute_reduction_method, + ) + return outputs + + # Special case if steps_per_execution is one. 
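+ # With a single step per execution (and no autotuning), each call to the test function runs exactly one step, so `step_function` is wrapped directly instead of being looped over `tf.range(steps_per_execution)`.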
+ if ( + self._steps_per_execution is None or + self._steps_per_execution.numpy().item() == 1 and not self.autotune_steps_per_execution + ): + + def test_function(iterator): + """Runs a test execution with a single step.""" + return step_function(iterator) + + if not self.run_eagerly: + test_function = tf.function(test_function, reduce_retracing=True) + + if self._cluster_coordinator: + self.test_function = (lambda it: self._cluster_coordinator.schedule(test_function, args=(it,))) + else: + self.test_function = test_function + + # If we're using a coordinator, use the value of + # self._steps_per_execution at the time the function is + # called/scheduled, and not when it is actually executed. + elif self._cluster_coordinator: + + def test_function(iterator, steps_per_execution): + """Runs a test execution with multiple steps.""" + for _ in tf.range(steps_per_execution): + outputs = step_function(iterator) + return outputs + + if not self.run_eagerly: + test_function = tf.function(test_function, reduce_retracing=True) + + self.test_function = lambda it: self._cluster_coordinator.schedule( + test_function, args=(it, self._steps_per_execution.value()) + ) + else: + + def test_function(iterator): + """Runs a test execution with multiple steps.""" + for _ in tf.range(self._steps_per_execution): + outputs = step_function(iterator) + return outputs + + if not self.run_eagerly: + test_function = tf.function(test_function, reduce_retracing=True) + self.test_function = test_function + + return self.test_function + + @traceback_utils.filter_traceback + def evaluate( + self, + x=None, + y=None, + batch_size=None, + verbose="auto", + sample_weight=None, + steps=None, + callbacks=None, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + return_dict=False, + **kwargs, + ): + """Returns the loss value & metrics values for the model in test mode. + + Computation is done in batches (see the `batch_size` arg.) + + Args: + x: Input data. It could be: + - A Numpy array (or array-like), or a list of arrays + (in case the model has multiple inputs). + - A TensorFlow tensor, or a list of tensors + (in case the model has multiple inputs). + - A dict mapping input names to the corresponding array/tensors, + if the model has named inputs. + - A `tf.data` dataset. Should return a tuple + of either `(inputs, targets)` or + `(inputs, targets, sample_weights)`. + - A generator or `keras.utils.Sequence` returning `(inputs, + targets)` or `(inputs, targets, sample_weights)`. + A more detailed description of unpacking behavior for iterator + types (Dataset, generator, Sequence) is given in the `Unpacking + behavior for iterator-like inputs` section of `Model.fit`. + y: Target data. Like the input data `x`, it could be either Numpy + array(s) or TensorFlow tensor(s). It should be consistent with `x` + (you cannot have Numpy inputs and tensor targets, or inversely). + If `x` is a dataset, generator or `keras.utils.Sequence` instance, + `y` should not be specified (since targets will be obtained from + the iterator/dataset). + batch_size: Integer or `None`. Number of samples per batch of + computation. If unspecified, `batch_size` will default to 32. Do + not specify the `batch_size` if your data is in the form of a + dataset, generators, or `keras.utils.Sequence` instances (since + they generate batches). + verbose: `"auto"`, 0, 1, or 2. Verbosity mode. + 0 = silent, 1 = progress bar, 2 = single line. + `"auto"` becomes 1 for most cases, and to 2 when used with + `ParameterServerStrategy`. 
Note that the progress bar is not + particularly useful when logged to a file, so `verbose=2` is + recommended when not running interactively (e.g. in a production + environment). Defaults to 'auto'. + sample_weight: Optional Numpy array of weights for the test samples, + used for weighting the loss function. You can either pass a flat + (1D) Numpy array with the same length as the input samples + (1:1 mapping between weights and samples), or in the case of + temporal data, you can pass a 2D array with shape `(samples, + sequence_length)`, to apply a different weight to every + timestep of every sample. This argument is not supported when + `x` is a dataset, instead pass sample weights as the third + element of `x`. + steps: Integer or `None`. Total number of steps (batches of samples) + before declaring the evaluation round finished. Ignored with the + default value of `None`. If x is a `tf.data` dataset and `steps` + is None, 'evaluate' will run until the dataset is exhausted. This + argument is not supported with array inputs. + callbacks: List of `keras.callbacks.Callback` instances. List of + callbacks to apply during evaluation. See + [callbacks](https://www.tensorflow.org/api_docs/python/tf/tf_keras/callbacks). + max_queue_size: Integer. Used for generator or + `keras.utils.Sequence` input only. Maximum size for the generator + queue. If unspecified, `max_queue_size` will default to 10. + workers: Integer. Used for generator or `keras.utils.Sequence` input + only. Maximum number of processes to spin up when using + process-based threading. If unspecified, `workers` will default to + 1. + use_multiprocessing: Boolean. Used for generator or + `keras.utils.Sequence` input only. If `True`, use process-based + threading. If unspecified, `use_multiprocessing` will default to + `False`. Note that because this implementation relies on + multiprocessing, you should not pass non-pickleable arguments to + the generator as they can't be passed easily to children + processes. + return_dict: If `True`, loss and metric results are returned as a + dict, with each key being the name of the metric. If `False`, they + are returned as a list. + **kwargs: Unused at this time. + + See the discussion of `Unpacking behavior for iterator-like inputs` for + `Model.fit`. + + Returns: + Scalar test loss (if the model has a single output and no metrics) + or list of scalars (if the model has multiple outputs + and/or metrics). The attribute `model.metrics_names` will give you + the display labels for the scalar outputs. + + Raises: + RuntimeError: If `model.evaluate` is wrapped in a `tf.function`. 
+ """ + version_utils.disallow_legacy_graph("Model", "evaluate") + self._assert_compile_was_called() + self._check_call_args("evaluate") + self._check_sample_weight_warning(x, sample_weight) + training_module._disallow_inside_tf_function("evaluate") + use_cached_eval_dataset = kwargs.pop("_use_cached_eval_dataset", False) + if kwargs: + raise TypeError(f"Invalid keyword arguments: {list(kwargs.keys())}") + + if self.distribute_strategy._should_use_with_coordinator: + self._cluster_coordinator = (tf.distribute.experimental.coordinator.ClusterCoordinator(self.distribute_strategy)) + + verbose = training_module._get_verbosity(verbose, self.distribute_strategy) + if self._pss_evaluation_shards: + self._disallow_exact_eval_with_add_metrics() + with self.distribute_strategy.scope(): + # Use cached evaluation data only when it's called in `Model.fit` + if (use_cached_eval_dataset and getattr(self, "_eval_data_handler", None) is not None): + data_handler = self._eval_data_handler + else: + # Creates a `tf.data.Dataset` and handles batch and epoch + # iteration. + data_handler = data_adapter.get_data_handler( + x=x, + y=y, + sample_weight=sample_weight, + batch_size=batch_size, + steps_per_epoch=steps, + initial_epoch=0, + epochs=1, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + model=self, + steps_per_execution=self._steps_per_execution, + pss_evaluation_shards=self._pss_evaluation_shards, + ) + + # Container that configures and calls `tf.keras.Callback`s. + if not isinstance(callbacks, callbacks_module.CallbackList): + callbacks = callbacks_module.CallbackList( + callbacks, + add_history=True, + add_progbar=verbose != 0, + model=self, + verbose=verbose, + epochs=1, + steps=data_handler.inferred_steps, + ) + + # Initialize to prevent errors if 0 epochs are evaluated. + logs = {} + + test_function_runner = self._get_test_function_runner(callbacks) + self._test_counter.assign(0) + callbacks.on_test_begin() + if self.autotune_steps_per_execution: + self._steps_per_execution_tuner.start() + for ( + _, + dataset_or_iterator, + ) in data_handler.enumerate_epochs(): # Single epoch. + self.reset_metrics() + with data_handler.catch_stop_iteration(): + for step in data_handler.steps(): + with tf.profiler.experimental.Trace("test", step_num=step, _r=1): + callbacks.on_test_batch_begin(step) + logs = test_function_runner.run_step( + dataset_or_iterator, + data_handler, + step, + self._pss_evaluation_shards, + ) + + logs = tf_utils.sync_to_numpy_or_python_type(logs) + # Override with model metrics instead of last step logs + if self._pss_evaluation_shards: + logs = self._aggregate_exact_metrics(logs) + else: + logs = self._validate_and_get_metrics_result(logs) + if self.autotune_steps_per_execution: + self._steps_per_execution_tuner.stop() + callbacks.on_test_end(logs=logs) + + if return_dict: + return logs + else: + return training_module.flatten_metrics_in_order(logs, self.metrics_names) + + def _disallow_exact_eval_with_add_metrics(self): + metrics_from_add_metric = [metric for layer in self._flatten_layers() for metric in layer._metrics] + compiled_metrics = self.compiled_metrics.metrics + if any([metric not in compiled_metrics for metric in metrics_from_add_metric]): + raise ValueError( + "Detected that a metric was added to this model " + "via `Model.add_metric`. This is not currently " + "supported when using exact evaluation with " + "`tf.distribute.ParameterServerStrategy`." 
+ ) + + def _infer_exact_eval_shards(self, pss_evaluation_shards): + if not self.distribute_strategy._should_use_with_coordinator: + return 0 + if pss_evaluation_shards == "auto": + # TODO(b/264265138) evaluate and improve this heuristic + return self.distribute_strategy._num_workers * 5 + return pss_evaluation_shards + + def _get_test_function_runner(self, callbacks): + if (self._pss_evaluation_shards and self.distribute_strategy._should_use_with_coordinator): + self.test_function = self._make_test_function_exact() + test_function_runner = training_module._ExactTestFunction(self.test_function, callbacks) + else: + self.test_function = self.make_test_function() + test_function_runner = training_module._TestFunction(self.test_function, callbacks) + return test_function_runner + + def predict_step(self, data): + """The logic for one inference step. + + This method can be overridden to support custom inference logic. + This method is called by `Model.make_predict_function`. + + This method should contain the mathematical logic for one step of + inference. This typically includes the forward pass. + + Configuration details for *how* this logic is run (e.g. `tf.function` + and `tf.distribute.Strategy` settings) should be left to + `Model.make_predict_function`, which can also be overridden. + + Args: + data: A nested structure of `Tensor`s. + + Returns: + The result of one inference step, typically the output of calling the + `Model` on data. + """ + x, _, _ = data_adapter.unpack_x_y_sample_weight(data) + return self.main_model(x, training=False) + + def make_predict_function(self, force=False): + """Creates a function that executes one step of inference. + + This method can be overridden to support custom inference logic. + This method is called by `Model.predict` and `Model.predict_on_batch`. + + Typically, this method directly controls `tf.function` and + `tf.distribute.Strategy` settings, and delegates the actual inference + logic to `Model.predict_step`. + + This function is cached the first time `Model.predict` or + `Model.predict_on_batch` is called. The cache is cleared whenever + `Model.compile` is called. You can skip the cache and generate again the + function with `force=True`. + + Args: + force: Whether to regenerate the predict function and skip the cached + function if available. + + Returns: + Function. The function created by this method should accept a + `tf.data.Iterator`, and return the outputs of the `Model`. + """ + if self.predict_function is not None and not force: + return self.predict_function + + def step_function(iterator): + """Runs a single inference step.""" + + def run_step(data): + outputs = self.predict_step(data) + # Ensure counter is updated only if `predict_step` succeeds. + with tf.control_dependencies(training_module._minimum_control_deps(outputs)): + self._predict_counter.assign_add(1) + return outputs + + if self.jit_compile: + run_step = tf.function(run_step, jit_compile=True, reduce_retracing=True) + + data = next(iterator) + outputs = self.distribute_strategy.run(run_step, args=(data,)) + outputs = training_module.reduce_per_replica(outputs, self.distribute_strategy, reduction="concat") + return outputs + + # Special case if steps_per_execution is one.
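+ # With a single step per execution (and no autotuning), the outputs of one `step_function` call are returned as-is; the multi-step branch below concatenates per-step outputs and therefore declares explicit loop shape invariants.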
+ if ( + self._steps_per_execution is None or + self._steps_per_execution.numpy().item() == 1 and not self.autotune_steps_per_execution + ): + + def predict_function(iterator): + """Runs an evaluation execution with a single step.""" + return step_function(iterator) + + else: + + def predict_function(iterator): + """Runs an evaluation execution with multiple steps.""" + outputs = step_function(iterator) + for _ in tf.range(self._steps_per_execution - 1): + tf.autograph.experimental.set_loop_options( + shape_invariants=[ + ( + outputs, + tf.nest.map_structure( + lambda t: tf_utils.get_tensor_spec(t, dynamic_batch=True).shape, + outputs, + ), + ) + ] + ) + step_outputs = step_function(iterator) + outputs = tf.nest.map_structure(lambda t1, t2: training_module.concat([t1, t2]), outputs, step_outputs) + return outputs + + if not self.run_eagerly: + predict_function = tf.function(predict_function, reduce_retracing=True) + self.predict_function = predict_function + + return self.predict_function + + @traceback_utils.filter_traceback + def predict( + self, + x, + batch_size=None, + verbose="auto", + steps=None, + callbacks=None, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + ): + """Generates output predictions for the input samples. + + Computation is done in batches. This method is designed for batch + processing of large numbers of inputs. It is not intended for use inside + of loops that iterate over your data and process small numbers of inputs + at a time. + + For small numbers of inputs that fit in one batch, + directly use `__call__()` for faster execution, e.g., + `model(x)`, or `model(x, training=False)` if you have layers such as + `tf.keras.layers.BatchNormalization` that behave differently during + inference. You may pair the individual model call with a `tf.function` + for additional performance inside your inner loop. + If you need access to numpy array values instead of tensors after your + model call, you can use `tensor.numpy()` to get the numpy array value of + an eager tensor. + + Also, note the fact that test loss is not affected by + regularization layers like noise and dropout. + + Note: See [this FAQ entry]( + https://keras.io/getting_started/faq/#whats-the-difference-between-model-methods-predict-and-call) + for more details about the difference between `Model` methods + `predict()` and `__call__()`. + + Args: + x: Input samples. It could be: + - A Numpy array (or array-like), or a list of arrays + (in case the model has multiple inputs). + - A TensorFlow tensor, or a list of tensors + (in case the model has multiple inputs). + - A `tf.data` dataset. + - A generator or `keras.utils.Sequence` instance. + A more detailed description of unpacking behavior for iterator + types (Dataset, generator, Sequence) is given in the `Unpacking + behavior for iterator-like inputs` section of `Model.fit`. + batch_size: Integer or `None`. + Number of samples per batch. + If unspecified, `batch_size` will default to 32. + Do not specify the `batch_size` if your data is in the + form of dataset, generators, or `keras.utils.Sequence` instances + (since they generate batches). + verbose: `"auto"`, 0, 1, or 2. Verbosity mode. + 0 = silent, 1 = progress bar, 2 = single line. + `"auto"` becomes 1 for most cases, and to 2 when used with + `ParameterServerStrategy`. Note that the progress bar is not + particularly useful when logged to a file, so `verbose=2` is + recommended when not running interactively (e.g. in a production + environment). Defaults to 'auto'. 
+ steps: Total number of steps (batches of samples) + before declaring the prediction round finished. + Ignored with the default value of `None`. If x is a `tf.data` + dataset and `steps` is None, `predict()` will + run until the input dataset is exhausted. + callbacks: List of `keras.callbacks.Callback` instances. + List of callbacks to apply during prediction. + See [callbacks]( + https://www.tensorflow.org/api_docs/python/tf/tf_keras/callbacks). + max_queue_size: Integer. Used for generator or + `keras.utils.Sequence` input only. Maximum size for the + generator queue. If unspecified, `max_queue_size` will default + to 10. + workers: Integer. Used for generator or `keras.utils.Sequence` input + only. Maximum number of processes to spin up when using + process-based threading. If unspecified, `workers` will default + to 1. + use_multiprocessing: Boolean. Used for generator or + `keras.utils.Sequence` input only. If `True`, use process-based + threading. If unspecified, `use_multiprocessing` will default to + `False`. Note that because this implementation relies on + multiprocessing, you should not pass non-pickleable arguments to + the generator as they can't be passed easily to children + processes. + + See the discussion of `Unpacking behavior for iterator-like inputs` for + `Model.fit`. Note that Model.predict uses the same interpretation rules + as `Model.fit` and `Model.evaluate`, so inputs must be unambiguous for + all three methods. + + Returns: + Numpy array(s) of predictions. + + Raises: + RuntimeError: If `model.predict` is wrapped in a `tf.function`. + ValueError: In case of mismatch between the provided + input data and the model's expectations, + or in case a stateful model receives a number of samples + that is not a multiple of the batch size. + """ + version_utils.disallow_legacy_graph("Model", "predict") + self._check_call_args("predict") + training_module._disallow_inside_tf_function("predict") + + # TODO(yashkatariya): Cache model on the coordinator for faster + # prediction. If running under PSS, then swap it with OneDeviceStrategy + # so that execution will run on the coordinator. + original_pss_strategy = None + if self.distribute_strategy._should_use_with_coordinator: + original_pss_strategy = self.distribute_strategy + self._distribution_strategy = None + + # Cluster coordinator is set by `.fit()` and `.evaluate()` which is not + # needed in `.predict()` because all the predictions happen on the + # coordinator/locally. + if self._cluster_coordinator: + self._cluster_coordinator = None + + verbose = training_module._get_verbosity(verbose, self.distribute_strategy) + outputs = None + with self.distribute_strategy.scope(): + # Creates a `tf.data.Dataset` and handles batch and epoch iteration. + dataset_types = (tf.compat.v1.data.Dataset, tf.data.Dataset) + if (self._in_multi_worker_mode() or + training_module._is_tpu_multi_host(self.distribute_strategy)) and isinstance(x, dataset_types): + try: + options = tf.data.Options() + data_option = tf.data.experimental.AutoShardPolicy.DATA + options.experimental_distribute.auto_shard_policy = (data_option) + x = x.with_options(options) + except ValueError: + warnings.warn( + "Using Model.predict with MultiWorkerMirroredStrategy " + "or TPUStrategy and AutoShardPolicy.FILE might lead to " + "out-of-order result. 
Consider setting it to " + "AutoShardPolicy.DATA.", + stacklevel=2, + ) + + data_handler = data_adapter.get_data_handler( + x=x, + batch_size=batch_size, + steps_per_epoch=steps, + initial_epoch=0, + epochs=1, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + model=self, + steps_per_execution=self._steps_per_execution, + ) + + # Container that configures and calls `tf.keras.Callback`s. + if not isinstance(callbacks, callbacks_module.CallbackList): + callbacks = callbacks_module.CallbackList( + callbacks, + add_history=True, + add_progbar=verbose != 0, + model=self, + verbose=verbose, + epochs=1, + steps=data_handler.inferred_steps, + ) + + self.predict_function = self.make_predict_function() + self._predict_counter.assign(0) + callbacks.on_predict_begin() + if self.autotune_steps_per_execution: + self._steps_per_execution_tuner.start() + batch_outputs = None + for _, iterator in data_handler.enumerate_epochs(): # Single epoch. + with data_handler.catch_stop_iteration(): + for step in data_handler.steps(): + callbacks.on_predict_batch_begin(step) + tmp_batch_outputs = self.predict_function(iterator) + if data_handler.should_sync: + context.async_wait() + batch_outputs = ( + tmp_batch_outputs # No error, now safe to assign. + ) + if outputs is None: + outputs = tf.nest.map_structure( + lambda batch_output: [batch_output], + batch_outputs, + ) + else: + tf.__internal__.nest.map_structure_up_to( + batch_outputs, + lambda output, batch_output: output.append(batch_output), + outputs, + batch_outputs, + ) + end_step = step + data_handler.step_increment + callbacks.on_predict_batch_end(end_step, {"outputs": batch_outputs}) + if batch_outputs is None: + raise ValueError( + "Unexpected result of `predict_function` " + "(Empty batch_outputs). Please use " + "`Model.compile(..., run_eagerly=True)`, or " + "`tf.config.run_functions_eagerly(True)` for more " + "information of where went wrong, or file a " + "issue/bug to `tf.keras`." + ) + if self.autotune_steps_per_execution: + self._steps_per_execution_tuner.stop() + callbacks.on_predict_end() + all_outputs = tf.__internal__.nest.map_structure_up_to( + batch_outputs, training_module.potentially_ragged_concat, outputs + ) + + # If originally PSS strategy was used, then replace it back since + # predict is running under `OneDeviceStrategy` after the swap and once + # its done we need to replace it back to PSS again. + if original_pss_strategy is not None: + self._distribution_strategy = original_pss_strategy + + return tf_utils.sync_to_numpy_or_python_type(all_outputs) + + def reset_metrics(self): + """Resets the state of all the metrics in the model. + + Examples: + + >>> inputs = tf.keras.layers.Input(shape=(3,)) + >>> outputs = tf.keras.layers.Dense(2)(inputs) + >>> model = tf.keras.models.Model(inputs=inputs, outputs=outputs) + >>> model.compile(optimizer="Adam", loss="mse", metrics=["mae"]) + + >>> x = np.random.random((2, 3)) + >>> y = np.random.randint(0, 2, (2, 2)) + >>> _ = model.fit(x, y, verbose=0) + >>> assert all(float(m.result()) for m in model.metrics) + + >>> model.reset_metrics() + >>> assert all(float(m.result()) == 0 for m in model.metrics) + + """ + for m in self.metrics: + m.reset_state() + + def train_on_batch( + self, + x, + y=None, + sample_weight=None, + class_weight=None, + reset_metrics=True, + return_dict=False, + ): + """Runs a single gradient update on a single batch of data. + + Args: + x: Input data. 
It could be: + - A Numpy array (or array-like), or a list of arrays + (in case the model has multiple inputs). + - A TensorFlow tensor, or a list of tensors + (in case the model has multiple inputs). + - A dict mapping input names to the corresponding array/tensors, + if the model has named inputs. + y: Target data. Like the input data `x`, it could be either Numpy + array(s) or TensorFlow tensor(s). + sample_weight: Optional array of the same length as x, containing + weights to apply to the model's loss for each sample. In the case + of temporal data, you can pass a 2D array with shape (samples, + sequence_length), to apply a different weight to every timestep of + every sample. + class_weight: Optional dictionary mapping class indices (integers) + to a weight (float) to apply to the model's loss for the samples + from this class during training. This can be useful to tell the + model to "pay more attention" to samples from an under-represented + class. When `class_weight` is specified and targets have a rank of + 2 or greater, either `y` must be one-hot encoded, or an explicit + final dimension of `1` must be included for sparse class labels. + reset_metrics: If `True`, the metrics returned will be only for this + batch. If `False`, the metrics will be statefully accumulated + across batches. + return_dict: If `True`, loss and metric results are returned as a + dict, with each key being the name of the metric. If `False`, they + are returned as a list. + + Returns: + Scalar training loss + (if the model has a single output and no metrics) + or list of scalars (if the model has multiple outputs + and/or metrics). The attribute `model.metrics_names` will give you + the display labels for the scalar outputs. + + Raises: + RuntimeError: If `model.train_on_batch` is wrapped in a `tf.function`. + """ + self._assert_compile_was_called() + self._check_call_args("train_on_batch") + training_module._disallow_inside_tf_function("train_on_batch") + if reset_metrics: + self.reset_metrics() + with self.distribute_strategy.scope(), training_utils.RespectCompiledTrainableState( # noqa: E501 + self + ): + iterator = data_adapter.single_batch_iterator(self.distribute_strategy, x, y, sample_weight, class_weight) + self.train_function = self.make_train_function() + logs = self.train_function(iterator) + + logs = tf_utils.sync_to_numpy_or_python_type(logs) + if return_dict: + return logs + else: + return training_module.flatten_metrics_in_order(logs, self.metrics_names) + + def test_on_batch( + self, + x, + y=None, + sample_weight=None, + reset_metrics=True, + return_dict=False, + ): + """Test the model on a single batch of samples. + + Args: + x: Input data. It could be: + - A Numpy array (or array-like), or a list of arrays (in case the + model has multiple inputs). + - A TensorFlow tensor, or a list of tensors (in case the model has + multiple inputs). + - A dict mapping input names to the corresponding array/tensors, + if the model has named inputs. + y: Target data. Like the input data `x`, it could be either Numpy + array(s) or TensorFlow tensor(s). It should be consistent with `x` + (you cannot have Numpy inputs and tensor targets, or inversely). + sample_weight: Optional array of the same length as x, containing + weights to apply to the model's loss for each sample. In the case + of temporal data, you can pass a 2D array with shape (samples, + sequence_length), to apply a different weight to every timestep of + every sample. 
+ reset_metrics: If `True`, the metrics returned will be only for this + batch. If `False`, the metrics will be statefully accumulated + across batches. + return_dict: If `True`, loss and metric results are returned as a + dict, with each key being the name of the metric. If `False`, they + are returned as a list. + + Returns: + Scalar test loss (if the model has a single output and no metrics) + or list of scalars (if the model has multiple outputs + and/or metrics). The attribute `model.metrics_names` will give you + the display labels for the scalar outputs. + + Raises: + RuntimeError: If `model.test_on_batch` is wrapped in a + `tf.function`. + """ + self._assert_compile_was_called() + self._check_call_args("test_on_batch") + training_module._disallow_inside_tf_function("test_on_batch") + if reset_metrics: + self.reset_metrics() + with self.distribute_strategy.scope(): + iterator = data_adapter.single_batch_iterator(self.distribute_strategy, x, y, sample_weight) + self.test_function = self.make_test_function() + logs = self.test_function(iterator) + + logs = tf_utils.sync_to_numpy_or_python_type(logs) + if return_dict: + return logs + else: + return training_module.flatten_metrics_in_order(logs, self.metrics_names) + + def predict_on_batch(self, x): + """Returns predictions for a single batch of samples. + + Args: + x: Input data. It could be: + - A Numpy array (or array-like), or a list of arrays (in case the + model has multiple inputs). + - A TensorFlow tensor, or a list of tensors (in case the model has + multiple inputs). + + Returns: + Numpy array(s) of predictions. + + Raises: + RuntimeError: If `model.predict_on_batch` is wrapped in a + `tf.function`. + """ + self._check_call_args("predict_on_batch") + training_module._disallow_inside_tf_function("predict_on_batch") + with self.distribute_strategy.scope(): + iterator = data_adapter.single_batch_iterator(self.distribute_strategy, x) + self.predict_function = self.make_predict_function() + outputs = self.predict_function(iterator) + return tf_utils.sync_to_numpy_or_python_type(outputs) + + @doc_controls.do_not_generate_docs + def fit_generator( + self, + generator, + steps_per_epoch=None, + epochs=1, + verbose=1, + callbacks=None, + validation_data=None, + validation_steps=None, + validation_freq=1, + class_weight=None, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + shuffle=True, + initial_epoch=0, + ): + """Fits the model on data yielded batch-by-batch by a Python generator. + + DEPRECATED: + `Model.fit` now supports generators, so there is no longer any need to + use this endpoint. + """ + warnings.warn( + "`Model.fit_generator` is deprecated and " + "will be removed in a future version. " + "Please use `Model.fit`, which supports generators.", + stacklevel=2, + ) + return self.fit( + generator, + steps_per_epoch=steps_per_epoch, + epochs=epochs, + verbose=verbose, + callbacks=callbacks, + validation_data=validation_data, + validation_steps=validation_steps, + validation_freq=validation_freq, + class_weight=class_weight, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + shuffle=shuffle, + initial_epoch=initial_epoch, + ) + + @doc_controls.do_not_generate_docs + def evaluate_generator( + self, + generator, + steps=None, + callbacks=None, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + verbose=0, + ): + """Evaluates the model on a data generator. 
+ + DEPRECATED: + `Model.evaluate` now supports generators, so there is no longer any + need to use this endpoint. + """ + warnings.warn( + "`Model.evaluate_generator` is deprecated and " + "will be removed in a future version. " + "Please use `Model.evaluate`, which supports generators.", + stacklevel=2, + ) + self._check_call_args("evaluate_generator") + + return self.evaluate( + generator, + steps=steps, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + verbose=verbose, + callbacks=callbacks, + ) + + @doc_controls.do_not_generate_docs + def predict_generator( + self, + generator, + steps=None, + callbacks=None, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + verbose=0, + ): + """Generates predictions for the input samples from a data generator. + + DEPRECATED: + `Model.predict` now supports generators, so there is no longer any + need to use this endpoint. + """ + warnings.warn( + "`Model.predict_generator` is deprecated and " + "will be removed in a future version. " + "Please use `Model.predict`, which supports generators.", + stacklevel=2, + ) + return self.predict( + generator, + steps=steps, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + verbose=verbose, + callbacks=callbacks, + ) + + def _check_call_args(self, method_name): + """Check that `call()` has only one positional arg.""" + # Always allow first arg, regardless of arg name. + fullargspec = self.main_model._call_spec.full_argspec + if fullargspec.defaults: + positional_args = fullargspec.args[:-len(fullargspec.defaults)] + else: + positional_args = fullargspec.args + if "training" in positional_args: + positional_args.remove("training") + + # self and first arg can be positional. + if len(positional_args) > 2: + extra_args = positional_args[2:] + raise ValueError( + f"Models passed to `{method_name}` can only have `training` " + "and the first argument in `call()` as positional arguments, " + f"found: {extra_args}." + ) + + def _validate_compile(self, optimizer, metrics, **kwargs): + """Performs validation checks for the default `compile()`.""" + if any(isinstance(opt, optimizer_v1.Optimizer) for opt in tf.nest.flatten(optimizer)): + raise ValueError( + f"`tf.compat.v1.keras` Optimizer ({optimizer}) is " + "not supported when eager execution is enabled. Use a " + "`tf.keras` Optimizer instead, or disable eager " + "execution." + ) + + kwargs.pop("cloning", None) # Legacy DistStrat argument, never used. + kwargs.pop("experimental_run_tf_function", None) # Always `True`. + distribute_arg = kwargs.pop("distribute", None) + if distribute_arg is not None: + raise ValueError( + "`distribute` argument in compile is not available in TF 2.0. " + "Please create the model under the `strategy.scope()`. " + f"Received: {distribute_arg}." + ) + target_tensor_arg = kwargs.pop("target_tensors", None) + if target_tensor_arg is not None: + raise ValueError( + "`target_tensors` argument is not supported when executing " + f"eagerly. Received: {target_tensor_arg}." + ) + invalid_kwargs = set(kwargs) - {"sample_weight_mode"} + if invalid_kwargs: + raise TypeError( + "Invalid keyword argument(s) in `compile()`: " + f"{(invalid_kwargs,)}. Valid keyword arguments include " + '"cloning", "experimental_run_tf_function", "distribute",' + ' "target_tensors", or "sample_weight_mode".' + ) + + # Model must be created and compiled with the same DistStrat. 
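+ # Verify that every model variable was created inside the current strategy's scope; if not, `compile` raises and asks the user to rebuild the model under `strategy.scope()`.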
+ if tf.distribute.has_strategy(): + strategy = tf.distribute.get_strategy() + for v in self.main_model.variables: + if not strategy.extended.variable_created_in_scope(v): + raise ValueError( + f"Variable ({v}) was not created in the distribution " + f"strategy scope of ({strategy}). It is most likely " + "because some layers, model, or optimizer was being " + "created outside the distribution strategy scope. Try " + "to make sure your code looks similar " + "to the following.\nwith strategy.scope():\n" + " model=_create_model()\n" + " model.compile(...)" + ) + + # Model metrics must be created in the same distribution strategy scope + # as the model. + strategy = self.distribute_strategy + for metric in tf.nest.flatten(metrics): + for v in getattr(metric, "variables", []): + if not strategy.extended.variable_created_in_scope(v): + raise ValueError( + f"Metric ({metric}) passed to `model.compile` was " + "created inside a different distribution strategy " + "scope than the model. All metrics must be created " + "in the same distribution strategy " + f"scope as the model (in this case {strategy}). " + "If you pass in a string identifier for a metric to " + "compile, the metric will automatically be created " + "in the correct distribution strategy scope." + ) + + # Model optimizers must be created in the same distribution strategy scope + # as the model. + for opt in tf.nest.flatten(optimizer): + for v in getattr(opt, "_weights", []): + if not strategy.extended.variable_created_in_scope(v): + raise ValueError( + f"Optimizer ({optimizer}) passed to `model.compile` " + "was created inside a different distribution strategy " + "scope than the model. All optimizers must be created " + "in the same distribution strategy scope as the model " + f"(in this case {strategy}). If you pass in a string " + "identifier for an optimizer to compile, the optimizer " + "will automatically be created in the correct " + "distribution strategy scope." + ) + + def _maybe_load_initial_counters_from_ckpt(self, steps_per_epoch, initial_epoch): + """Maybe load initial (epoch, step) counters from ckpt, considering worker recovery. + + Refer to tensorflow/python/tf_keras/distribute/worker_training_state.py + for more information. + + Args: + steps_per_epoch: The number of steps per epoch. + initial_epoch: The original initial_epoch the user passed in `fit()`. + + Returns: + If the training is recovering from a previous failure under a multi-worker + training setting, return the (epoch, step) the training is supposed to + continue at. Otherwise, return the `initial_epoch, initial_step` the + user passed in. + """ + initial_step = 0 + if self._training_state is not None: + return self._training_state.maybe_load_initial_counters_from_ckpt( + steps_per_epoch, initial_epoch, mode=ModeKeys.TRAIN + ) + return (initial_epoch, initial_step) + + def _assert_compile_was_called(self): + # Checks whether `compile` has been called. If it has been called, + # then the optimizer is set. This is different from whether the + # model is compiled + # (i.e. whether the model is built and its inputs/outputs are set). + if not self._is_compiled: + raise RuntimeError( + "You must compile your model before " + "training/testing. " + "Use `model.compile(optimizer, loss)`." + ) + + def _check_sample_weight_warning(self, x, sample_weight): + # Datasets can include sample weight, by returning a tuple with the + # structure of `(x, y, sample_weight)`.
+ sample_weight_present = sample_weight is not None or ( + isinstance(x, tf.data.Dataset) and isinstance(x.element_spec, tuple) and len(x.element_spec) == 3 + ) + + if (sample_weight_present and self.compiled_metrics._user_weighted_metrics is None): + logging.warning( + "`evaluate()` received a value for `sample_weight`, but " + "`weighted_metrics` were not provided. Did you mean to pass " + "metrics to `weighted_metrics` in `compile()`? If this is " + "intentional you can pass `weighted_metrics=[]` to `compile()` " + "in order to silence this warning." + ) + + def _should_eval(self, epoch, validation_freq): + epoch = epoch + 1 # one-index the user-facing epoch. + if isinstance(validation_freq, int): + return epoch % validation_freq == 0 + elif isinstance(validation_freq, list): + return epoch in validation_freq + else: + raise ValueError( + "Expected `validation_freq` to be a list or int. " + f"Received: validation_freq={validation_freq} of the " + f"type {type(validation_freq)}." + ) + + ###################################################################### + # Functions below exist only as v1 / v2 compatibility shims. + ###################################################################### + + def _get_compile_args(self, user_metrics=True): + """Used for saving or cloning a Model. + + Args: + user_metrics: Whether to return user-supplied metrics or `Metric` + objects. If True, returns the user-supplied metrics. + Defaults to `True`. + + Returns: + Dictionary of arguments that were used when compiling the model. + """ + self._assert_compile_was_called() + saved_metrics = self.compiled_metrics._user_metrics + saved_weighted_metrics = self.compiled_metrics._user_weighted_metrics + + if not user_metrics: + if saved_metrics is not None: + saved_metrics = self.compiled_metrics._metrics + if saved_weighted_metrics is not None: + saved_weighted_metrics = self.compiled_metrics._weighted_metrics + + compile_args = { + "optimizer": self.optimizer, + "loss": self.compiled_loss._user_losses, + "metrics": saved_metrics, + "weighted_metrics": saved_weighted_metrics, + "loss_weights": self.compiled_loss._user_loss_weights, + } + return compile_args + + def _get_callback_model(self): + return self + + def _in_multi_worker_mode(self): + return self.distribute_strategy.extended._in_multi_worker_mode() + + @property + def _compile_was_called(self): + return self._is_compiled + + @property + def main_model(self): + """ + Returns: + The main model. + """ + if len(self._model) == 1: + return self._model["main"] + else: + for name, _model in self._model.items(): + if "main" in name: + return _model + raise ValueError("Could not find the main model.") + + @tf.__internal__.tracking.no_automatic_dependency_tracking + def _maybe_create_attribute(self, name, default_value): + """Create attribute (with the default value) if it hasn't been created. + + This is useful for fields that are used for tracking purposes, such as + _trainable_weights or _layers. Note that a user could create a layer + subclass and assign an internal field before invoking Layer.__init__(); + __setattr__() needs to create the tracking fields and __init__() must + not override them. + + Args: + name: String, the name of the attribute. + default_value: Object, the default value of the attribute. + """ + if not hasattr(self, name): + self.__setattr__(name, default_value) + + def _get_trainable_state(self): + """Get the `trainable` state of each sublayer. + + Returns: + A dict mapping all sublayers to their `trainable` value.
+ """ + trainable_state = weakref.WeakKeyDictionary() + for layer in self.main_model._flatten_layers(): + trainable_state[layer] = layer.trainable + return trainable_state diff --git a/deepray/core/utils/misc/distribution_utils.py b/deepray/core/utils/misc/distribution_utils.py index 9a9d072f..0042d469 100644 --- a/deepray/core/utils/misc/distribution_utils.py +++ b/deepray/core/utils/misc/distribution_utils.py @@ -284,40 +284,6 @@ def undo_set_up_synthetic_data(): _undo_monkey_patch_dataset_method(tf.distribute.experimental.MultiWorkerMirroredStrategy) -def configure_cluster(worker_hosts=None, task_index=-1): - """Set multi-worker cluster spec in TF_CONFIG environment variable. - - Args: - worker_hosts: comma-separated list of worker ip:port pairs. - - Returns: - Number of workers in the cluster. - """ - tf_config = json.loads(os.environ.get('TF_CONFIG', '{}')) - if tf_config: - num_workers = (len(tf_config['cluster'].get('chief', [])) + len(tf_config['cluster'].get('worker', []))) - elif worker_hosts: - workers = worker_hosts.split(',') - num_workers = len(workers) - if num_workers > 1 and task_index < 0: - raise ValueError('Must specify task_index when number of workers > 1') - task_index = 0 if num_workers == 1 else task_index - os.environ['TF_CONFIG'] = json.dumps( - { - 'cluster': { - 'worker': workers - }, - 'task': { - 'type': 'worker', - 'index': task_index - } - } - ) - else: - num_workers = 1 - return num_workers - - def get_strategy_scope(strategy): if strategy: strategy_scope = strategy.scope() diff --git a/deepray/core/utils/misc/keras_utils.py b/deepray/core/utils/misc/keras_utils.py deleted file mode 100644 index a4c24e97..00000000 --- a/deepray/core/utils/misc/keras_utils.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Helper functions for the Keras implementations of models.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import multiprocessing -import os -import time - -from absl import logging -import tensorflow as tf -from tensorflow.core.protobuf import rewriter_config_pb2 -from tensorflow.python import tf2 -from tensorflow.python.eager import profiler - - -class BatchTimestamp(object): - """A structure to store batch time stamp.""" - - def __init__(self, batch_index, timestamp): - self.batch_index = batch_index - self.timestamp = timestamp - - def __repr__(self): - return "'BatchTimestamp'".format(self.batch_index, self.timestamp) - - -class TimeHistory(tf.keras.callbacks.Callback): - """Callback for Keras models.""" - - def __init__(self, batch_size, log_steps): - """Callback for logging performance. - - Args: - batch_size: Total batch size. - log_steps: Interval of steps between logging of batch level stats. 
- """ - self.batch_size = batch_size - super(TimeHistory, self).__init__() - self.log_steps = log_steps - self.global_steps = 0 - - # Logs start of step 1 then end of each step based on log_steps interval. - self.timestamp_log = [] - - # Records the time each epoch takes to run from start to finish of epoch. - self.epoch_runtime_log = [] - - def on_train_end(self, logs=None): - self.train_finish_time = time.time() - - def on_epoch_begin(self, epoch, logs=None): - self.epoch_start = time.time() - - def on_batch_begin(self, batch, logs=None): - self.global_steps += 1 - if self.global_steps == 1: - self.start_time = time.time() - self.timestamp_log.append(BatchTimestamp(self.global_steps, self.start_time)) - - def on_batch_end(self, batch, logs=None): - """Records elapse time of the batch and calculates examples per second.""" - if self.global_steps % self.log_steps == 0: - timestamp = time.time() - elapsed_time = timestamp - self.start_time - examples_per_second = (self.batch_size * self.log_steps) / elapsed_time - self.timestamp_log.append(BatchTimestamp(self.global_steps, timestamp)) - logging.info( - "BenchmarkMetric: {'global step':%d, 'time_taken': %f," - "'examples_per_second': %f}", self.global_steps, elapsed_time, examples_per_second - ) - self.start_time = timestamp - - def on_epoch_end(self, epoch, logs=None): - epoch_run_time = time.time() - self.epoch_start - self.epoch_runtime_log.append(epoch_run_time) - logging.info("BenchmarkMetric: {'epoch':%d, 'time_taken': %f}", epoch, epoch_run_time) - - -def get_profiler_callback(model_dir, profile_steps, enable_tensorboard, steps_per_epoch): - """Validate profile_steps flag value and return profiler callback.""" - profile_steps_error_message = ( - 'profile_steps must be a comma separated pair of positive integers, ' - 'specifying the first and last steps to be profiled.' - ) - try: - profile_steps = [int(i) for i in profile_steps.split(',')] - except ValueError: - raise ValueError(profile_steps_error_message) - if len(profile_steps) != 2: - raise ValueError(profile_steps_error_message) - start_step, stop_step = profile_steps - if start_step < 0 or start_step > stop_step: - raise ValueError(profile_steps_error_message) - if enable_tensorboard: - logging.warning( - 'Both TensorBoard and profiler callbacks are used. Note that the ' - 'TensorBoard callback profiles the 2nd step (unless otherwise ' - 'specified). Please make sure the steps profiled by the two callbacks ' - 'do not overlap.' 
- ) - return ProfilerCallback(model_dir, start_step, stop_step, steps_per_epoch) - - -class ProfilerCallback(tf.keras.callbacks.Callback): - """Save profiles in specified step range to log directory.""" - - def __init__(self, log_dir, start_step, stop_step, steps_per_epoch): - super(ProfilerCallback, self).__init__() - self.log_dir = log_dir - self.start_step = start_step - self.stop_step = stop_step - self.start_epoch = start_step // steps_per_epoch - self.stop_epoch = stop_step // steps_per_epoch - self.start_step_in_epoch = start_step % steps_per_epoch - self.stop_step_in_epoch = stop_step % steps_per_epoch - self.should_start = False - self.should_stop = False - - def on_epoch_begin(self, epoch, logs=None): - if epoch == self.start_epoch: - self.should_start = True - if epoch == self.stop_epoch: - self.should_stop = True - - def on_batch_begin(self, batch, logs=None): - if batch == self.start_step_in_epoch and self.should_start: - self.should_start = False - profiler.start() - logging.info('Profiler started at Step %s', self.start_step) - - def on_batch_end(self, batch, logs=None): - if batch == self.stop_step_in_epoch and self.should_stop: - self.should_stop = False - results = profiler.stop() - profiler.save(self.log_dir, results) - logging.info( - 'Profiler saved profiles for steps between %s and %s to %s', self.start_step, self.stop_step, self.log_dir - ) - - -def set_session_config(enable_eager=False, enable_xla=False): - """Sets the session config.""" - if is_v2_0(): - set_config_v2(enable_xla=enable_xla) - else: - config = get_config_proto_v1(enable_xla=enable_xla) - if enable_eager: - tf.compat.v1.enable_eager_execution(config=config) - else: - sess = tf.Session(config=config) - tf.keras.backend.set_session(sess) - - -def get_config_proto_v1(enable_xla=False): - """Return config proto according to flag settings, or None to use default.""" - config = None - if enable_xla: - config = tf.compat.v1.ConfigProto() - config.graph_options.optimizer_options.global_jit_level = (tf.OptimizerOptions.ON_2) - return config - - -def set_config_v2(enable_xla=False): - """Config eager context according to flag values using TF 2.0 API.""" - if enable_xla: - tf.config.optimizer.set_jit(True) - - -def is_v2_0(): - """Returns true if using tf 2.0.""" - return tf2.enabled() - - -def set_gpu_thread_mode_and_count(gpu_thread_mode, datasets_num_private_threads, num_gpus, per_gpu_thread_count): - """Set GPU thread mode and count, and adjust dataset threads count.""" - cpu_count = multiprocessing.cpu_count() - logging.info('Logical CPU cores: %s', cpu_count) - - # Allocate private thread pool for each GPU to schedule and launch kernels - per_gpu_thread_count = per_gpu_thread_count or 2 - os.environ['TF_GPU_THREAD_MODE'] = gpu_thread_mode - os.environ['TF_GPU_THREAD_COUNT'] = str(per_gpu_thread_count) - logging.info('TF_GPU_THREAD_COUNT: %s', os.environ['TF_GPU_THREAD_COUNT']) - logging.info('TF_GPU_THREAD_MODE: %s', os.environ['TF_GPU_THREAD_MODE']) - - # Limit data preprocessing threadpool to CPU cores minus number of total GPU - # private threads and memory copy threads. 
- total_gpu_thread_count = per_gpu_thread_count * num_gpus - num_runtime_threads = num_gpus - if not datasets_num_private_threads: - datasets_num_private_threads = min(cpu_count - total_gpu_thread_count - num_runtime_threads, num_gpus * 8) - logging.info('Set datasets_num_private_threads to %s', datasets_num_private_threads) diff --git a/deepray/custom_ops/BUILD b/deepray/custom_ops/BUILD index 4b0226ff..99fabfd4 100644 --- a/deepray/custom_ops/BUILD +++ b/deepray/custom_ops/BUILD @@ -5,12 +5,15 @@ py_library( srcs = glob(["**/*.py"]), deps = [ "//deepray/custom_ops/correlation_cost", + "//deepray/custom_ops/embedding_bag", + "//deepray/custom_ops/embedding_variable", "//deepray/custom_ops/ffm_ops", - "//deepray/custom_ops/multiplex_1:multiplex_1_op", + "//deepray/custom_ops/multiplex_1", "//deepray/custom_ops/multiplex_2:multiplex_2_op", "//deepray/custom_ops/multiplex_3:multiplex_3_op", "//deepray/custom_ops/multiplex_4:multiplex_4_op", "//deepray/custom_ops/parquet_dataset", + "//deepray/custom_ops/seq2seq", "//deepray/custom_ops/simple_hash_table", "//deepray/custom_ops/sleep:sleep_op", "//deepray/custom_ops/training_ops", diff --git a/deepray/custom_ops/correlation_cost/BUILD b/deepray/custom_ops/correlation_cost/BUILD index 0a9c71a9..9927511c 100644 --- a/deepray/custom_ops/correlation_cost/BUILD +++ b/deepray/custom_ops/correlation_cost/BUILD @@ -1,19 +1,24 @@ +load("@rules_python//python:defs.bzl", "py_test") load("//deepray:deepray.bzl", "custom_op_library") licenses(["notice"]) # Apache 2.0 package(default_visibility = ["//visibility:public"]) +CORRELATION_COST_OP_SRCS = [ + "cc/kernels/correlation_cost_op.cc", + "cc/ops/correlation_cost_op.cc", +] + custom_op_library( name = "_correlation_cost_ops.so", - srcs = [ - "cc/kernels/correlation_cost_op.cc", - "cc/kernels/correlation_cost_op.h", - "cc/ops/correlation_cost_op.cc", + srcs = CORRELATION_COST_OP_SRCS + ["cc/kernels/correlation_cost_op.h"], + gpu_deps = [ + "@cub_archive//:cub", ], - cuda_srcs = [ - "cc/kernels/correlation_cost_op.h", + gpu_srcs = [ "cc/kernels/correlation_cost_op_gpu.cu.cc", + "cc/kernels/correlation_cost_op.h", ], ) @@ -25,12 +30,7 @@ py_library( "*.py", ], ), - data = [ - ":_correlation_cost_ops.so", - ], - deps = [ - "//deepray/utils", - ], + data = [":_correlation_cost_ops.so"], ) py_test( @@ -38,7 +38,12 @@ py_test( size = "small", srcs = glob(["python/tests/*"]), main = "python/tests/run_all_test.py", + python_version = "PY3", deps = [ ":correlation_cost", + "//deepray/utils", + "@pypi_pytest//:pkg", + "@pypi_tensorflow//:pkg", + "@pypi_typeguard//:pkg", ], ) diff --git a/deepray/custom_ops/correlation_cost/cc/kernels/correlation_cost_op_gpu.cu.cc b/deepray/custom_ops/correlation_cost/cc/kernels/correlation_cost_op_gpu.cu.cc index 9496d47c..9978bcdb 100644 --- a/deepray/custom_ops/correlation_cost/cc/kernels/correlation_cost_op_gpu.cu.cc +++ b/deepray/custom_ops/correlation_cost/cc/kernels/correlation_cost_op_gpu.cu.cc @@ -17,8 +17,9 @@ limitations under the License. 
#define EIGEN_USE_GPU +#include + #include "correlation_cost_op.h" -#include "cub/device/device_reduce.cuh" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/util/gpu_kernel_helper.h" diff --git a/deepray/custom_ops/correlation_cost/python/optical_flow.py b/deepray/custom_ops/correlation_cost/python/optical_flow.py index d7565e5d..38742732 100644 --- a/deepray/custom_ops/correlation_cost/python/optical_flow.py +++ b/deepray/custom_ops/correlation_cost/python/optical_flow.py @@ -18,7 +18,7 @@ from typeguard import typechecked from deepray.utils.resource_loader import LazySO -_correlation_cost_so = LazySO("custom_ops/correlation_cost/_correlation_cost_ops.so") +gen_correlation_cost_ops = LazySO("custom_ops/correlation_cost/_correlation_cost_ops.so") def _correlation_cost( @@ -76,7 +76,7 @@ def _correlation_cost( """ with tf.name_scope(name or "correlation_cost"): - op_call = _correlation_cost_so.ops.deepray_correlation_cost + op_call = gen_correlation_cost_ops.ops.deepray_correlation_cost if data_format == "channels_last": op_data_format = "NHWC" @@ -116,7 +116,7 @@ def _correlation_cost_grad(op, grad_output): input_b = tf.convert_to_tensor(op.inputs[1], name="input_b") grad_output_tensor = tf.convert_to_tensor(grad_output, name="grad_output") - op_call = _correlation_cost_so.ops.deepray_correlation_cost_grad + op_call = gen_correlation_cost_ops.ops.deepray_correlation_cost_grad grads = op_call( input_a, input_b, diff --git a/deepray/custom_ops/correlation_cost/python/tests/run_all_test.py b/deepray/custom_ops/correlation_cost/python/tests/run_all_test.py index d5c4af3d..8261049e 100644 --- a/deepray/custom_ops/correlation_cost/python/tests/run_all_test.py +++ b/deepray/custom_ops/correlation_cost/python/tests/run_all_test.py @@ -1,8 +1,7 @@ from pathlib import Path import sys - import pytest if __name__ == "__main__": dirname = Path(__file__).absolute().parent - sys.exit(pytest.main([str(dirname)])) + sys.exit(pytest.main(["-s", str(dirname)])) diff --git a/deepray/custom_ops/distributed_embeddings/BUILD b/deepray/custom_ops/distributed_embeddings/BUILD index acba7f11..33e86195 100644 --- a/deepray/custom_ops/distributed_embeddings/BUILD +++ b/deepray/custom_ops/distributed_embeddings/BUILD @@ -11,16 +11,25 @@ custom_op_library( "cc/kernels/embedding_lookup_kernels.cc", "cc/ops/embedding_lookup_ops.cc", ], - cuda_srcs = [ + gpu_deps = [ + "@cub_archive//:cub", + "@com_github_NVIDIA_cuCollections//:cuCollections", + "@local_config_cuda//cuda:cuda_headers", + "@local_config_cuda//cuda:cuda_runtime", + "@local_config_cuda//cuda:cudart", + ], + gpu_srcs = [ + # TODO: Update cuCollections version + "cc/kernels/embedding_lookup.h", "cc/kernels/embedding_lookup_kernels.cu.cc", ], deps = [ - "@cuCollections//:cuco_hash_table", + "//deepray/custom_ops/utils:ok_status_util", ], ) py_library( - name = "distributed_embeddings_ops", + name = "distributed_embeddings", srcs = glob( [ "python/*.py", @@ -42,6 +51,6 @@ py_test( srcs = glob(["python/tests/*"]), main = "python/tests/run_all_test.py", deps = [ - ":distributed_embeddings_ops", + ":distributed_embeddings", ], ) diff --git a/deepray/custom_ops/distributed_embeddings/cc/kernels/embedding_lookup_kernels.cu.cc b/deepray/custom_ops/distributed_embeddings/cc/kernels/embedding_lookup_kernels.cu.cc index eca712d0..09638d40 100644 --- a/deepray/custom_ops/distributed_embeddings/cc/kernels/embedding_lookup_kernels.cu.cc +++ 
b/deepray/custom_ops/distributed_embeddings/cc/kernels/embedding_lookup_kernels.cu.cc @@ -21,8 +21,9 @@ #include -#include "cub/cub.cuh" -#include "cuco/static_map.cuh" +#include +#include + #include "embedding_lookup.h" #include "tensorflow/core/lib/core/bits.h" #include "tensorflow/core/util/gpu_kernel_helper.h" diff --git a/deepray/custom_ops/distributed_embeddings/cc/ops/embedding_lookup_ops.cc b/deepray/custom_ops/distributed_embeddings/cc/ops/embedding_lookup_ops.cc index feb00d43..7d498505 100644 --- a/deepray/custom_ops/distributed_embeddings/cc/ops/embedding_lookup_ops.cc +++ b/deepray/custom_ops/distributed_embeddings/cc/ops/embedding_lookup_ops.cc @@ -15,6 +15,7 @@ * limitations under the License. */ +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" @@ -30,7 +31,7 @@ REGISTER_OP("ReadVariableNoCopy") TF_RETURN_IF_ERROR( shape_inference::ValidateVariableResourceHandle(c, &shape_and_type)); c->set_output(0, shape_and_type[0].shape); - return Status::OK(); + return TFOkStatus; }); REGISTER_OP("RowToSplit") @@ -40,7 +41,7 @@ REGISTER_OP("RowToSplit") .Output("row_split: Tindices") .SetShapeFn([](shape_inference::InferenceContext* c) { // TODO - return Status::OK(); + return TFOkStatus; }); REGISTER_OP("EmbeddingLookupVariableHotness") @@ -66,7 +67,7 @@ REGISTER_OP("EmbeddingLookupVariableHotness") outdim_0 -= 1; } c->set_output(0, c->Matrix(outdim_0, c->Dim(params_shape, 1))); - return Status::OK(); + return TFOkStatus; }); REGISTER_OP("EmbeddingLookupVariableHotnessGrad") @@ -86,7 +87,7 @@ REGISTER_OP("EmbeddingLookupVariableHotnessGrad") c->Vector(shape_inference::InferenceContext::kUnknownDim)); c->set_output(1, c->Matrix(shape_inference::InferenceContext::kUnknownDim, c->Dim(grad_shape, 1))); - return Status::OK(); + return TFOkStatus; }); REGISTER_OP("IntegerLookup") @@ -99,7 +100,7 @@ REGISTER_OP("IntegerLookup") .Output("values: T") .SetShapeFn([](shape_inference::InferenceContext* c) { c->set_output(0, c->input(2)); - return Status::OK(); + return TFOkStatus; }); } // namespace tensorflow diff --git a/deepray/custom_ops/distributed_embeddings/python/tests/dist_model_parallel_test.py b/deepray/custom_ops/distributed_embeddings/python/tests/dist_model_parallel_test.py index 7a094719..e98161d9 100644 --- a/deepray/custom_ops/distributed_embeddings/python/tests/dist_model_parallel_test.py +++ b/deepray/custom_ops/distributed_embeddings/python/tests/dist_model_parallel_test.py @@ -31,8 +31,6 @@ flags.DEFINE_bool("graph_mode", default=False, help="Run in graph mode.") flags.DEFINE_string("mixed_precision_policy", default=None, help="Mixed precision policy to be set.") -FLAGS = flags.FLAGS - large_testcase_sizes = [ [2, 8], [2, 16], [10, 8], [10, 16], [10, 16], [10, 16], [10, 16], [10, 16], [10, 32], [10, 128], [10, 128], [10, 128], [10, 128], [10, 1024], [100, 16], [100, 32], [100, 32], [100, 32], [100, 32], [100, 128], diff --git a/deepray/custom_ops/embedding_bag/BUILD b/deepray/custom_ops/embedding_bag/BUILD new file mode 100644 index 00000000..89e3236a --- /dev/null +++ b/deepray/custom_ops/embedding_bag/BUILD @@ -0,0 +1,49 @@ +load("@rules_python//python:defs.bzl", "py_test") +load("//deepray:deepray.bzl", "custom_op_library") + +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = ["//visibility:public"]) + +custom_op_library( + name = "_embedding_bag_ops.so", + srcs = [ + "cc/kernels/embedding_bag_ops.cc", 
+ "cc/kernels/embedding_bag_ops.h", + "cc/ops/embedding_bag_ops.cc", + ], + gpu_deps = [ + "@local_config_cuda//cuda:cuda_runtime", + ], + gpu_srcs = [ + "cc/kernels/embedding_bag_ops.h", + "cc/kernels/embedding_bag_ops_gpu.cu.cc", + "cc/kernels/embedding_bag_backward_kernels.cu.cc", + ], +) + +py_library( + name = "embedding_bag", + srcs = glob( + [ + "python/*.py", + "*.py", + ], + ), + data = [":_embedding_bag_ops.so"], +) + +py_test( + name = "embedding_bag_test", + size = "small", + srcs = glob(["python/tests/*"]), + main = "python/tests/run_all_test.py", + python_version = "PY3", + deps = [ + ":embedding_bag", + "//deepray/utils", + "@pypi_pytest//:pkg", + "@pypi_tensorflow//:pkg", + "@pypi_typeguard//:pkg", + ], +) diff --git a/deepray/custom_ops/embedding_bag/__init__.py b/deepray/custom_ops/embedding_bag/__init__.py new file mode 100644 index 00000000..7f50af3e --- /dev/null +++ b/deepray/custom_ops/embedding_bag/__init__.py @@ -0,0 +1 @@ +from .python.embedding_bag import EmbeddingBag, _embedding_bag diff --git a/deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_backward_kernels.cu.cc b/deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_backward_kernels.cu.cc new file mode 100644 index 00000000..b6cdce68 --- /dev/null +++ b/deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_backward_kernels.cu.cc @@ -0,0 +1,247 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include +#include +#include + +#include "embedding_bag_ops.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +constexpr int MAX_THREADS_PER_BLOCK = 1024; + +namespace tensorflow { +namespace addons { +namespace functor { + +typedef Eigen::GpuDevice GPUDevice; + +template +__global__ void PrepTempArraysKernel( + const Tindices *__restrict__ indices, Tindices *__restrict__ sortedIndices, + Tindices *__restrict__ sortedIndicesCounter, const int indices_size) { + const int arrayIdx = (blockIdx.x * kThreadsPerBlock) + threadIdx.x; + if (arrayIdx < + indices_size) { // Make sure we don't run off the end of the actual array + sortedIndices[arrayIdx] = indices[arrayIdx]; + sortedIndicesCounter[arrayIdx] = arrayIdx; + } +} + +// Define the CUDA kernel. 
+template +__global__ void EmbeddingBagWeightsGradKernel( + const int value_dim, const Tindices *__restrict__ indices, + const T *__restrict__ values, const T *__restrict__ dloss, + T *__restrict__ weights_grad, Combiner combiner) { + const int sample_idx = blockIdx.x; + const int bag_idx = blockIdx.y; + const int bag_dim = gridDim.y; + const int valueBaseIdx = + indices[(sample_idx * bag_dim) + bag_idx] * value_dim; + const int dlossBaseIdx = sample_idx * value_dim; + // Use a full-precision accumulator even for half-precision inputs + float partialDotProduct = 0.0f; + for (int i = threadIdx.x; i < value_dim; + i += blockDim.x) // Note that some threads may stop one iteration + // earlier if the block straddles the end of the array + { + partialDotProduct += + static_cast(values[valueBaseIdx + i] * dloss[dlossBaseIdx + i]); + } + unsigned activeMask = 0xffffffff; +#pragma unroll + for (int offset = kThreadsPerBlock / 2; offset > 0; offset /= 2) { + partialDotProduct += + __shfl_down_sync(activeMask, partialDotProduct, offset); + } + if (combiner == Combiner::kMean) { + partialDotProduct /= static_cast(bag_dim); + } + // Thread 0 now has the full dot product + if (threadIdx.x == 0) { + weights_grad[(sample_idx * bag_dim) + bag_idx] = + static_cast(partialDotProduct); + } +} + +template +__global__ void EmbeddingBagValuesGradKernel( + const int value_dim, const int bag_dim, + const Tindices *__restrict__ sortedIndices, + const Tindices *__restrict__ counter, const T *__restrict__ values, + const T *__restrict__ weights, const T *__restrict__ dloss, + T *__restrict__ values_grad, Combiner combiner) { + const int startIdx = blockIdx.x; + const int chunk = blockIdx.y; + const int kThreadsPerBlock = blockDim.x; + const int featureIdx = threadIdx.x + (chunk * kThreadsPerBlock); + // The core problem here is that we want to avoid parallel writes to the + // same element of the grads. We avoid that by pre-sorting a copy of the + // indices tensor, and also co-sorting a 'counter' array so that we still know + // which element of the incoming gradient tensor corresponds to each. Then, we + // take the slightly lazy approach of spinning up a warp for each element of + // the indices array, but having each warp check the previous element before + // it starts. If the two elements are the same, then the warp immediately + // returns without doing anything. If not, then the warp iterates forward and + // accumulates gradient until it hits a different index element, at which + // point it writes the accumulated value and returns. This ensures that each + // row of the values grad tensor is handled by one and exactly one warp. 
+ const int valuesIdx = ldg(sortedIndices + startIdx); + if (startIdx > 0) { + const int prevIdx = ldg(sortedIndices + startIdx - 1); + if (prevIdx == valuesIdx) { + return; // Another block is handling this index, exit + } + } + int endIdx = startIdx; + while (endIdx < gridDim.x - 1) // Don't run off the end of the array + { + int nextIdx = endIdx + 1; + int nextValuesIdx = ldg(sortedIndices + nextIdx); + if (nextValuesIdx == valuesIdx) { + endIdx += 1; + } else { + break; + } + } + if (featureIdx < value_dim) // Don't run off the end of the row + { + const int outputOffset = (valuesIdx * value_dim) + featureIdx; + float accum = 0.0f; // Full precision even if the inputs aren't + + for (int currentIdx = startIdx; currentIdx <= endIdx; ++currentIdx) { + int originalIdxPosition = ldg(counter + currentIdx); + T weight = weights[originalIdxPosition]; + // The floor division on this line is correct and intentional + T featureDloss = + ldg(dloss + (originalIdxPosition / bag_dim) + featureIdx); + accum += static_cast(weight * featureDloss); + } + if (combiner == Combiner::kMean) { + accum /= static_cast(bag_dim); + } + values_grad[outputOffset] = static_cast(accum); + } +} + +// Define the GPU implementation that launches the CUDA kernel. +template +struct EmbeddingBagBackwardFunctor { + // indices should remain unchanged, but thrust complains if it's a const + // pointer + void operator()(const GPUDevice &d, + typename TTypes::ConstTensor indices, + typename TTypes::ConstTensor params, + typename TTypes::ConstTensor weights, + typename TTypes::ConstTensor grads, + typename TTypes::Tensor params_grads, + typename TTypes::Tensor weights_grads, + Combiner combiner, OpKernelContext *context) { + // I copy-pasted this bit from histogram_op_gpu.cu.cc and I sure hope it + // works + tensorflow::AllocatorAttributes gpu_allocator; + gpu_allocator.set_on_host(false); + gpu_allocator.set_gpu_compatible(true); + + Tensor sortedIndicesTensor; + Tensor sortedIndicesCounterTensor; + + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum::value, + TensorShape({indices.size()}), + &sortedIndicesTensor, gpu_allocator)); + OP_REQUIRES_OK(context, context->allocate_temp( + DataTypeToEnum::value, + TensorShape({indices.size()}), + &sortedIndicesCounterTensor, gpu_allocator)); + auto sortedIndices = sortedIndicesTensor.flat(); + auto sortedIndicesCounter = sortedIndicesCounterTensor.flat(); + // Note: I tried splitting the two kernels into different streams but + // performance was barely affected. 
+ const Eigen::Index batch_dim = indices.dimension(0); + const Eigen::Index bag_dim = indices.dimension(1); + const Eigen::Index output_dim = params.dimension(1); + const auto params_size = params.size(); + const int kThreadsPerBlock = 32; + dim3 gridShape = dim3(batch_dim, bag_dim, 1); + TF_CHECK_OK(GpuLaunchKernel( + EmbeddingBagWeightsGradKernel, gridShape, + kThreadsPerBlock, 0, d.stream(), output_dim, indices.data(), + params.data(), grads.data(), weights_grads.data(), combiner)); + + const int indices_size = indices.size(); + const int values_size = params.size(); + const int total_blocks = Eigen::divup(indices_size, kThreadsPerBlock); + gridShape = dim3(total_blocks, 1, 1); + + TF_CHECK_OK(GpuLaunchKernel( + PrepTempArraysKernel, gridShape, + kThreadsPerBlock, 0, d.stream(), indices.data(), sortedIndices.data(), + sortedIndicesCounter.data(), indices_size)); + + thrust::device_ptr sortedIndicesCounterDevicePtr( + sortedIndicesCounter.data()); + thrust::device_ptr sortedIndicesDevicePtr(sortedIndices.data()); + thrust::device_ptr paramsGradDevicePtr(params_grads.data()); + thrust::fill(paramsGradDevicePtr, + paramsGradDevicePtr + static_cast(params_size), + static_cast(0.0f)); + thrust::sort_by_key(sortedIndicesDevicePtr, + sortedIndicesDevicePtr + indices_size, + sortedIndicesCounterDevicePtr); + // Handle each row with as few thread blocks as possible + int threadsPerBlock; + int blocksPerRow; + if (output_dim <= MAX_THREADS_PER_BLOCK) { + blocksPerRow = 1; + threadsPerBlock = output_dim; + } else { + blocksPerRow = + Eigen::divup(static_cast(output_dim), MAX_THREADS_PER_BLOCK); + threadsPerBlock = + Eigen::divup(static_cast(output_dim), blocksPerRow); + } + // int blocksPerRow = 1; + // while (threadsPerBlock > MAX_THREADS_PER_BLOCK) { + // threadsPerBlock = (threadsPerBlock + 1) / 2; // Ceiling division + // blocksPerRow *= 2; + // } + gridShape = dim3(indices_size, blocksPerRow, 1); + TF_CHECK_OK(GpuLaunchKernel( + EmbeddingBagValuesGradKernel, gridShape, threadsPerBlock, + 0, d.stream(), output_dim, bag_dim, sortedIndices.data(), + sortedIndicesCounter.data(), params.data(), weights.data(), + grads.data(), params_grads.data(), combiner)); + } +}; + +// Explicitly instantiate functors for the types of OpKernels registered. +template struct EmbeddingBagBackwardFunctor; +template struct EmbeddingBagBackwardFunctor; +template struct EmbeddingBagBackwardFunctor; +template struct EmbeddingBagBackwardFunctor; +template struct EmbeddingBagBackwardFunctor; +template struct EmbeddingBagBackwardFunctor; +} // namespace functor +} // namespace addons +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_ops.cc b/deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_ops.cc new file mode 100644 index 00000000..fd6169d1 --- /dev/null +++ b/deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_ops.cc @@ -0,0 +1,330 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#define EIGEN_USE_THREADS + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA + +#include "embedding_bag_ops.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_shape.h" + +namespace tensorflow { +namespace addons { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { +// CPU specialization of actual computation. +template +struct EmbeddingBagFunctor { + static constexpr int64 kPacketSize = Eigen::internal::packet_traits::size; + using VectorMap = Eigen::Map>; + using ConstVectorMap = Eigen::Map>; + + void operator()(const CPUDevice &device, + typename TTypes::ConstTensor indices, + typename TTypes::ConstTensor params, + typename TTypes::ConstTensor weights, + typename TTypes::Tensor output, Combiner combiner) { + const Eigen::Index bags = indices.dimension(0); + const Eigen::Index sequence_length = indices.dimension(1); + const Eigen::Index output_dim = params.dimension(1); + + const auto work = [&](Eigen::Index start, Eigen::Index end) { + for (Eigen::Index bag = start; bag < end; ++bag) { + VectorMap output_slice(&output(bag, 0), output_dim); + output_slice.setZero(); + for (Eigen::Index seq = 0; seq < sequence_length; ++seq) { + const ConstVectorMap params_slice(¶ms(indices(bag, seq), 0), + output_dim); + output_slice += params_slice * weights(bag, seq); + } + if (combiner == Combiner::kMean) { + output_slice /= static_cast(sequence_length); + } + } + }; + + const double bytes_loaded = + sequence_length * (sizeof(Tindices) + sizeof(T)) + + (sequence_length * output_dim) * sizeof(T); + const double bytes_stored = output_dim * sizeof(T); + const double compute_cycles = + (sequence_length * output_dim) * + (Eigen::TensorOpCost::AddCost() + Eigen::TensorOpCost::MulCost()); + const Eigen::TensorOpCost cost(bytes_loaded, bytes_stored, compute_cycles, + /*vectorized=*/true, + /*packet_size=*/kPacketSize); + device.parallelFor(bags, cost, std::move(work)); + } +}; + +// CPU specialization of actual computation. +template +struct EmbeddingBagBackwardFunctor { + static constexpr int64 kPacketSize = Eigen::internal::packet_traits::size; + using VectorMap = Eigen::Map>; + using ConstVectorMap = Eigen::Map>; + + void operator()(const CPUDevice &device, + typename TTypes::ConstTensor indices, + typename TTypes::ConstTensor params, + typename TTypes::ConstTensor weights, + typename TTypes::ConstTensor grads, + typename TTypes::Tensor params_grads, + typename TTypes::Tensor weights_grads, + Combiner combiner, OpKernelContext *context) { + const Eigen::Index sequence_length = indices.dimension(1); + const Eigen::Index output_dim = params.dimension(1); + + std::unordered_map index_map; + // The pair (x, {y_i}) in index_vec means + // index y_i in `indices` contributes to bag `x`. 
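+    // Here x is a row of `params` and each y_i is a flattened position in
+    // `indices` holding that row. For example, indices = [[3, 1], [3, 2]]
+    // (flattened to [3, 1, 3, 2]) gives index_vec = (3, {0, 2}), (1, {1}),
+    // (2, {3}), so row 3 of params_grads accumulates the gradients flowing
+    // through flattened positions 0 and 2.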
+ std::vector>> index_vec; + for (Eigen::Index i = 0; i < indices.size(); ++i) { + Tindices index = indices.data()[i]; + if (index_map.find(index) == index_map.end()) { + index_map[index] = index_vec.size(); + index_vec.push_back({index, {}}); + } + index_vec[index_map[index]].second.push_back(i); + } + + const auto compute_params_grads = [&](Eigen::Index start, + Eigen::Index end) { + for (Eigen::Index i = start; i < end; ++i) { + VectorMap params_grads_slice(¶ms_grads(index_vec[i].first, 0), + output_dim); + for (Eigen::Index index : index_vec[i].second) { + const Eigen::Index bag = index / sequence_length; + const Eigen::Index seq = index % sequence_length; + const ConstVectorMap grads_slice(&grads(bag, 0), output_dim); + params_grads_slice += grads_slice * weights(bag, seq); + } + if (combiner == Combiner::kMean) { + params_grads_slice /= static_cast(sequence_length); + } + } + }; + + const Eigen::Index num_unique_params = index_vec.size(); + const double bytes_loaded = 100 * output_dim * sizeof(T); + const double bytes_stored = output_dim * sizeof(T); + const double compute_cycles = + 100 * output_dim * + (Eigen::TensorOpCost::AddCost() + Eigen::TensorOpCost::MulCost()); + const Eigen::TensorOpCost cost(bytes_loaded, bytes_stored, compute_cycles, + /*vectorized=*/true, + /*packet_size=*/kPacketSize); + params_grads.setZero(); + device.parallelFor(num_unique_params, cost, + std::move(compute_params_grads)); + + const auto compute_weights_grads = + [&](const Eigen::array &coords) -> T { + const Eigen::Index bag = coords[0]; + const Eigen::Index seq = coords[1]; + const ConstVectorMap grads_slice(&grads(bag, 0), output_dim); + const ConstVectorMap params_slice(¶ms(indices(bag, seq), 0), + output_dim); + T output = params_slice.dot(grads_slice); + if (combiner == Combiner::kMean) { + output /= static_cast(sequence_length); + } + return output; + }; + + weights_grads.device(device) = + weights_grads.generate(std::move(compute_weights_grads)); + } +}; +} // namespace functor + +namespace { +bool ValidateCombiner(const std::string &combiner_string, Combiner *combiner) { + if (combiner_string == "SUM") { + *combiner = Combiner::kSum; + } else if (combiner_string == "MEAN") { + *combiner = Combiner::kMean; + } else { + return false; + } + return true; +} +} // namespace + +template +class EmbeddingBagOp : public OpKernel { + public: + explicit EmbeddingBagOp(OpKernelConstruction *context) : OpKernel(context) { + std::string combiner_string; + OP_REQUIRES_OK(context, context->GetAttr("combiner", &combiner_string)); + OP_REQUIRES( + context, ValidateCombiner(combiner_string, &combiner_), + errors::InvalidArgument("Only support 'SUM' and 'MEAN' combiner.")); + } + + void Compute(OpKernelContext *context) override { + const Tensor &indices = context->input(0); + const Tensor ¶ms = context->input(1); + const Tensor &weights = context->input(2); + + const TensorShape &indices_shape = indices.shape(); + const TensorShape ¶ms_shape = params.shape(); + const TensorShape &weights_shape = weights.shape(); + + OP_REQUIRES(context, TensorShapeUtils::IsMatrix(indices_shape), + errors::InvalidArgument("indices shape should be 2-D.")); + OP_REQUIRES(context, indices_shape == weights_shape, + errors::InvalidArgument( + "Shape of indices and weights should be equal.")); + OP_REQUIRES(context, TensorShapeUtils::IsMatrix(params_shape), + errors::InvalidArgument("params shape should be 2-D.")); + + TensorShape output_shape = {indices_shape.dim_size(0), + params_shape.dim_size(1)}; + + Tensor *output = nullptr; + 
OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output)); + + functor::EmbeddingBagFunctor()( + context->eigen_device(), indices.tensor(), + params.tensor(), weights.tensor(), output->tensor(), + combiner_); + } + + private: + Combiner combiner_; +}; + +template +class EmbeddingBagBackwardOp : public OpKernel { + public: + explicit EmbeddingBagBackwardOp(OpKernelConstruction *context) + : OpKernel(context) { + std::string combiner_string; + OP_REQUIRES_OK(context, context->GetAttr("combiner", &combiner_string)); + OP_REQUIRES( + context, ValidateCombiner(combiner_string, &combiner_), + errors::InvalidArgument("Only support 'SUM' and 'MEAN' combiner.")); + } + + void Compute(OpKernelContext *context) override { + const Tensor &indices = context->input(0); + const Tensor ¶ms = context->input(1); + const Tensor &weights = context->input(2); + const Tensor &grads = context->input(3); + + Tensor *params_grads = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, params.shape(), ¶ms_grads)); + Tensor *weights_grads = nullptr; + OP_REQUIRES_OK( + context, context->allocate_output(1, weights.shape(), &weights_grads)); + functor::EmbeddingBagBackwardFunctor()( + context->eigen_device(), indices.tensor(), + params.tensor(), weights.tensor(), grads.tensor(), + params_grads->tensor(), weights_grads->tensor(), combiner_, + context); // Pass the context so the GPU op can allocate the temporary + // arrays it needs + } + + private: + Combiner combiner_; +}; + +// Register the CPU kernels. +#define REGISTER_CPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER(Name("Deepray>EmbeddingBag") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + EmbeddingBagOp); \ + REGISTER_KERNEL_BUILDER(Name("Deepray>EmbeddingBag") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + EmbeddingBagOp); \ + REGISTER_KERNEL_BUILDER(Name("Deepray>EmbeddingBagGrad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + EmbeddingBagBackwardOp); \ + REGISTER_KERNEL_BUILDER(Name("Deepray>EmbeddingBagGrad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + EmbeddingBagBackwardOp); +REGISTER_CPU_KERNEL(Eigen::half); +REGISTER_CPU_KERNEL(float); +REGISTER_CPU_KERNEL(double); +#undef REGISTER_CPU_KERNEL + +#if GOOGLE_CUDA +namespace functor { +// Forward declarations of the functor specializations for GPU. +#define DECLARE_GPU_SPEC(T, Tindices) \ + template <> \ + void EmbeddingBagFunctor::operator()( \ + const GPUDevice &, typename TTypes::ConstTensor, \ + typename TTypes::ConstTensor, typename TTypes::ConstTensor, \ + typename TTypes::Tensor, Combiner); \ + extern template struct EmbeddingBagFunctor; + +#define DECLARE_GPU_SPECS(T) \ + DECLARE_GPU_SPEC(T, int32); \ + DECLARE_GPU_SPEC(T, int64); + +DECLARE_GPU_SPECS(Eigen::half); +DECLARE_GPU_SPECS(float); +DECLARE_GPU_SPECS(double); +#undef DECLARE_GPU_SPEC +#undef DECLARE_GPU_SPECS +} // namespace functor + +// Register the GPU kernels. 
+#define REGISTER_GPU_KERNEL(T) \ + REGISTER_KERNEL_BUILDER(Name("Deepray>EmbeddingBag") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + EmbeddingBagOp); \ + REGISTER_KERNEL_BUILDER(Name("Deepray>EmbeddingBag") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + EmbeddingBagOp); \ + REGISTER_KERNEL_BUILDER(Name("Deepray>EmbeddingBagGrad") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + EmbeddingBagBackwardOp); \ + REGISTER_KERNEL_BUILDER(Name("Deepray>EmbeddingBagGrad") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + EmbeddingBagBackwardOp); +REGISTER_GPU_KERNEL(Eigen::half); +REGISTER_GPU_KERNEL(float); +REGISTER_GPU_KERNEL(double); +#undef REGISTER_GPU_KERNEL +#endif // GOOGLE_CUDA +} // namespace addons +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_ops.h b/deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_ops.h new file mode 100644 index 00000000..b05c58e7 --- /dev/null +++ b/deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_ops.h @@ -0,0 +1,57 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_ADDONS_LAYERS_KERNELS_EMBEDDING_BAG_OPS_H_ +#define TENSORFLOW_ADDONS_LAYERS_KERNELS_EMBEDDING_BAG_OPS_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_types.h" + +namespace tensorflow { +namespace addons { + +enum class Combiner { + kSum, + kMean, +}; + +namespace functor { + +template +struct EmbeddingBagFunctor { + void operator()(const Device &device, + typename TTypes::ConstTensor indices, + typename TTypes::ConstTensor params, + typename TTypes::ConstTensor weights, + typename TTypes::Tensor output, Combiner combiner); +}; + +template +struct EmbeddingBagBackwardFunctor { + void operator()(const Device &device, + typename TTypes::ConstTensor indices, + typename TTypes::ConstTensor params, + typename TTypes::ConstTensor weights, + typename TTypes::ConstTensor grads, + typename TTypes::Tensor params_grads, + typename TTypes::Tensor weights_grads, + Combiner combiner, OpKernelContext *context); +}; + +} // namespace functor +} // namespace addons +} // namespace tensorflow + +#endif // TENSORFLOW_ADDONS_LAYERS_KERNELS_EMBEDDING_BAG_OPS_H_ diff --git a/deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_ops_gpu.cu.cc b/deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_ops_gpu.cu.cc new file mode 100644 index 00000000..7be3d552 --- /dev/null +++ b/deepray/custom_ops/embedding_bag/cc/kernels/embedding_bag_ops_gpu.cu.cc @@ -0,0 +1,108 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "embedding_bag_ops.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { +namespace addons { + +typedef Eigen::GpuDevice GPUDevice; + +namespace { +// Define the GPU kernel. +template +__global__ void EmbeddingBagGPUKernel(const Tindices *__restrict__ indices, + const T *__restrict__ params, + const T *__restrict__ weights, + T *__restrict__ output, + const Eigen::Index output_dim, + const Eigen::Index sequence_length, + Combiner combiner) { + // blockIdx.x indicates which row of the output we are writing to. It also + // indicates which `bag` we're reading from. + // blockIdx.y indicates which chunk of that row we are writing to. + // threadIdx.x indicates which element of that chunk we are writing to. + + // feature_idx is the position in the final dimension of the output that we + // are writing to. + const Eigen::Index feature_idx = blockIdx.y * kThreadsPerBlock + threadIdx.x; + // It's necessary in case output_dim is not evenly divided by blockDim.x. + if (feature_idx < output_dim) { + // output_idx is the offset of the output we are writing to. + const Eigen::Index output_idx = blockIdx.x * output_dim + feature_idx; + // bag_offset is the offset in indices corresponding to the first + // index of the `bag` that we will be summing over. + const Eigen::Index bag_offset = blockIdx.x * sequence_length; + T accum = static_cast(0); + for (Eigen::Index idx_offset = bag_offset; + idx_offset < bag_offset + sequence_length; ++idx_offset) { + accum += params[indices[idx_offset] * output_dim + feature_idx] * + weights[idx_offset]; + } + if (combiner == Combiner::kMean) { + accum /= static_cast(sequence_length); + } + output[output_idx] = accum; + } +} +} // namespace + +namespace functor { +// Define the GPU implementation that launches the CUDA kernel. +template +struct EmbeddingBagFunctor { + static constexpr int kThreadsPerBlock = 32; + + void operator()(const GPUDevice &device, + typename TTypes::ConstTensor indices, + typename TTypes::ConstTensor params, + typename TTypes::ConstTensor weights, + typename TTypes::Tensor output, Combiner combiner) { + const Eigen::Index bags = indices.dimension(0); + const Eigen::Index sequence_length = indices.dimension(1); + const Eigen::Index output_dim = params.dimension(1); + + const int blocks_per_value_vec = + Eigen::divup(output_dim, static_cast(kThreadsPerBlock)); + const dim3 grids = dim3(bags, blocks_per_value_vec); + + TF_CHECK_OK(GpuLaunchKernel( + EmbeddingBagGPUKernel, grids, + kThreadsPerBlock, 0, device.stream(), indices.data(), params.data(), + weights.data(), output.data(), output_dim, sequence_length, combiner)); + } +}; + +// Explicit instantiation of the GPU functor. 
+#define DECLARE_GPU_SPECS(T) \ + template struct EmbeddingBagFunctor; \ + template struct EmbeddingBagFunctor; + +DECLARE_GPU_SPECS(Eigen::half); +DECLARE_GPU_SPECS(float); +DECLARE_GPU_SPECS(double); +#undef DECLARE_GPU_SPECS + +} // namespace functor +} // namespace addons +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/deepray/custom_ops/embedding_bag/cc/ops/embedding_bag_ops.cc b/deepray/custom_ops/embedding_bag/cc/ops/embedding_bag_ops.cc new file mode 100644 index 00000000..38a39cb1 --- /dev/null +++ b/deepray/custom_ops/embedding_bag/cc/ops/embedding_bag_ops.cc @@ -0,0 +1,70 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { +namespace addons { + +using ::tensorflow::shape_inference::DimensionHandle; +using ::tensorflow::shape_inference::InferenceContext; +using ::tensorflow::shape_inference::ShapeHandle; + +REGISTER_OP("Deepray>EmbeddingBag") + .Input("indices: Tindices") + .Input("params: T") + .Input("weights: T") + .Output("output: T") + .Attr("T: {bfloat16, half, float, double}") + .Attr("Tindices: {int32, int64}") + .Attr("combiner: {'SUM', 'MEAN'} = 'SUM'") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle indices, params, weights, unused, output; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &indices)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, ¶ms)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &weights)); + DimensionHandle output_dim = c->Dim(params, 1); + TF_RETURN_IF_ERROR( + c->ReplaceDim(indices, c->Rank(indices) - 1, output_dim, &output)); + TF_RETURN_IF_ERROR(c->Merge(indices, weights, &unused)); + c->set_output(0, output); + return Status(); + }); + +REGISTER_OP("Deepray>EmbeddingBagGrad") + .Input("indices: Tindices") + .Input("params: T") + .Input("weights: T") + .Input("grads: T") + .Output("params_grads: T") + .Output("weights_grads: T") + .Attr("T: {bfloat16, half, float, double}") + .Attr("Tindices: {int32, int64}") + .Attr("combiner: {'SUM', 'MEAN'} = 'SUM'") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle indices, params, weights, unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &indices)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, ¶ms)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &weights)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 2, &unused)); + TF_RETURN_IF_ERROR(c->Merge(indices, weights, &unused)); + c->set_output(0, c->input(1)); + c->set_output(1, c->input(2)); + return Status(); + }); + +} // namespace addons +} // namespace tensorflow diff --git a/deepray/layers/nlp/__init__.py b/deepray/custom_ops/embedding_bag/python/__init__.py similarity index 100% rename from deepray/layers/nlp/__init__.py rename to deepray/custom_ops/embedding_bag/python/__init__.py diff --git a/deepray/custom_ops/embedding_bag/python/embedding_bag.py 
b/deepray/custom_ops/embedding_bag/python/embedding_bag.py new file mode 100644 index 00000000..9c6acc04 --- /dev/null +++ b/deepray/custom_ops/embedding_bag/python/embedding_bag.py @@ -0,0 +1,143 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +import tensorflow as tf +from typeguard import typechecked + +from deepray.utils.types import Constraint, Initializer, Regularizer +from deepray.utils.resource_loader import LazySO + +_embedding_bag_so = LazySO("custom_ops/embedding_bag/_embedding_bag_ops.so") + + +def _embedding_bag( + indices, + params, + weights=None, + combiner="sum", + name=None, +): + """EmbeddingBag computation. + + See [PyTorch op](https://pytorch.org/docs/stable/generated/torch.nn.EmbeddingBag.html). + + Equivalent to tf.gather() followed by tf.reduce_{sum,mean}() across the last dimension, with optional + weights. Fusing these into a single op has massive benefits for execution speed and particularly + memory usage, as the intermediate output of the gather never needs to be materialized. + + Args: + indices: An int32 or int64 `Tensor` of the indices to gather from + `params`. Must be at least 2-dimensional, as the last dimension + will be summed out. Maximum value must be less than params.shape[0]. + params: A float32 `Tensor` from which to gather params. Must be rank 2. + weights: A float32 `Tensor` of weights which will be applied to each of + the gathered embedding vectors before the sum step. + name: A name for the operation (optional). + + Returns: + A `Tensor` of the format specified by `data_format`. + """ + if weights is None: + weights = tf.ones_like(indices, dtype=params.dtype) + elif combiner != "sum": + raise RuntimeError("Combiner mode must be 'sum' when weights are supplied to EmbeddingBag!") + + return _embedding_bag_so.ops.deepray_embedding_bag(indices, params, weights, combiner=combiner.upper(), name=name) + + +@tf.RegisterGradient("Deepray>EmbeddingBag") +def _embedding_bag_grad(op, grads): + indices, params, weights = op.inputs[:3] + combiner = op.get_attr("combiner") + value_grads, weight_grads = _embedding_bag_so.ops.deepray_embedding_bag_grad( + indices, params, weights, grads, combiner=combiner + ) + return [None, value_grads, weight_grads] + + +@tf.keras.utils.register_keras_serializable(package="Deepray") +class EmbeddingBag(tf.keras.layers.Layer): + """EmbeddingBag Layer. + + See [PyTorch op](https://pytorch.org/docs/stable/generated/torch.nn.EmbeddingBag.html). + + Equivalent to tf.gather() followed by tf.reduce_sum() across the last dimension, with optional + weights. Fusing these into a single op has massive benefits for execution speed and particularly + memory usage, as the intermediate output of the gather never needs to be materialized. + + Input Shapes: + indices: An int32 or int64 `Tensor` of the indices to gather from + `params`. 
Must be at least 2-dimensional, as the last dimension + will be summed out. Maximum value must be less than params.shape[0]. + params: A float32 `Tensor` from which to gather params. Must be rank 2. + weights: A float32 `Tensor` of weights which will be applied to each of + the gathered embedding vectors before the sum step. + + Output shape: + indices.shape[:-1], params.shape[-1] + """ + + @typechecked + def __init__( + self, + input_dim: int, + output_dim: int, + embeddings_initializer: Initializer = "uniform", + embeddings_regularizer: Regularizer = None, + embeddings_constraint: Constraint = None, + mask_zero: bool = False, + combiner: str = "sum", + **kwargs, + ): + super(EmbeddingBag, self).__init__(**kwargs) + if input_dim <= 0 or output_dim <= 0: + raise ValueError( + "Both `input_dim` and `output_dim` should be positive, " + "found input_dim {} and output_dim {}".format(input_dim, output_dim) + ) + self.input_dim = input_dim + self.output_dim = output_dim + self.embeddings_initializer = tf.keras.initializers.get(embeddings_initializer) + self.embeddings_regularizer = tf.keras.regularizers.get(embeddings_regularizer) + self.embeddings_constraint = tf.keras.constraints.get(embeddings_constraint) + self.mask_zero = mask_zero + self.supports_masking = mask_zero + self.combiner = combiner + + def build(self, input_shape): + self.embeddings = self.add_weight( + shape=(self.input_dim, self.output_dim), + name="embeddings", + initializer=self.embeddings_initializer, + regularizer=self.embeddings_regularizer, + constraint=self.embeddings_constraint, + ) + self.built = True + + def call(self, indices, weights=None): + return _embedding_bag(indices, self.embeddings, weights, combiner=self.combiner) + + def get_config(self): + config = { + "input_dim": self.input_dim, + "output_dim": self.output_dim, + "embeddings_initializer": tf.keras.initializers.serialize(self.embeddings_initializer), + "embeddings_regularizer": tf.keras.regularizers.serialize(self.embeddings_regularizer), + "embeddings_constraint": tf.keras.constraints.serialize(self.embeddings_constraint), + "mask_zero": self.mask_zero, + "combiner": self.combiner, + } + base_config = super(EmbeddingBag, self).get_config() + return dict(list(base_config.items()) + list(config.items())) diff --git a/deepray/layers/nlp/transformer/__init__.py b/deepray/custom_ops/embedding_bag/python/tests/__init__.py similarity index 100% rename from deepray/layers/nlp/transformer/__init__.py rename to deepray/custom_ops/embedding_bag/python/tests/__init__.py diff --git a/deepray/custom_ops/embedding_bag/python/tests/embedding_bag_test.py b/deepray/custom_ops/embedding_bag/python/tests/embedding_bag_test.py new file mode 100644 index 00000000..f1d1ee33 --- /dev/null +++ b/deepray/custom_ops/embedding_bag/python/tests/embedding_bag_test.py @@ -0,0 +1,116 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for EmbeddingBag layer.""" + +import pytest +import numpy as np +import tensorflow as tf + +from deepray.custom_ops.embedding_bag import EmbeddingBag, _embedding_bag +from deepray.utils import test_utils + + +def manual_embedding_bag(indices, params, weights=None, combiner="mean"): + gathered = tf.gather(params, indices) + if weights is not None: + gathered *= tf.expand_dims(weights, -1) + if combiner == "sum": + return tf.reduce_sum(gathered, -2, keepdims=False) + else: + assert combiner == "mean" + assert weights is None + return tf.reduce_mean(gathered, -2, keepdims=False) + + +@pytest.mark.with_device(["cpu", "gpu"]) +@pytest.mark.parametrize("input_shape", [(16, 32)]) +@pytest.mark.parametrize("input_dim", [63, 64]) +@pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64]) +@pytest.mark.parametrize("indices_dtype", [np.int32, np.int64]) +@pytest.mark.parametrize("combiner", ["sum", "mean"]) +def test_forward(input_shape, input_dim, dtype, indices_dtype, combiner): + indices = np.random.randint(low=0, high=input_dim, size=input_shape).astype(indices_dtype) + params = np.random.random(size=(input_dim, 16)).astype(dtype) + if combiner == "sum": + weights = np.random.random(size=indices.shape).astype(dtype) + else: + weights = None + expected = manual_embedding_bag(indices, params, weights, combiner=combiner) + embedding_bag = EmbeddingBag(input_dim, 16, combiner=combiner, dtype=dtype) + embedding_bag.build(indices.shape) + embedding_bag.set_weights([params]) + indices = tf.convert_to_tensor(indices) + if weights is not None: + weights = tf.convert_to_tensor(weights) + output = embedding_bag( + indices, + weights, + ) + test_utils.assert_allclose_according_to_type(expected, output, half_rtol=1e-2, half_atol=1e-2) + + +@pytest.mark.with_device(["cpu", "gpu"]) +@pytest.mark.parametrize("input_shape", [(16, 32)]) +@pytest.mark.parametrize("input_dim", [63, 64]) +@pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64]) +@pytest.mark.parametrize("indices_dtype", [np.int32, np.int64]) +@pytest.mark.parametrize("combiner", ["sum", "mean"]) +@pytest.mark.usefixtures("maybe_run_functions_eagerly") +def test_backward(input_shape, input_dim, dtype, indices_dtype, combiner): + indices = np.random.randint(low=0, high=input_dim, size=input_shape).astype(indices_dtype) + params = np.random.random(size=(input_dim, 16)).astype(dtype) + if combiner == "sum": + weights = np.random.random(size=indices.shape).astype(dtype) + else: + weights = None + + indices = tf.convert_to_tensor(indices) + params = tf.convert_to_tensor(params) + if weights is not None: + weights = tf.convert_to_tensor(weights) + + embedding_bag_fn = tf.function(_embedding_bag) + + if combiner == "sum": + with tf.GradientTape(persistent=True) as tape: + tape.watch([params, weights]) + output = embedding_bag_fn(indices, params, weights, combiner="sum") + expected = manual_embedding_bag(indices, params, weights, combiner="sum") + + grads = tape.gradient(output, [params, weights]) + expected_grads = tape.gradient(expected, [params, weights]) + # Gather returns sparse IndexedSlices so we have to sum them together. 
+ test_utils.assert_allclose_according_to_type( + tf.convert_to_tensor(expected_grads[0]), + tf.convert_to_tensor(grads[0]), + half_rtol=1e-2, + half_atol=1e-2, + ) + test_utils.assert_allclose_according_to_type(expected_grads[1], grads[1], half_rtol=1e-2, half_atol=1e-2) + else: + with tf.GradientTape(persistent=True) as tape: + tape.watch(params) + output = embedding_bag_fn(indices, params, combiner=combiner) + expected = manual_embedding_bag(indices, params, combiner=combiner) + + grads = tape.gradient(output, [params]) + expected_grads = tape.gradient(expected, [params]) + # Gather returns sparse IndexedSlices so we have to sum them together. + test_utils.assert_allclose_according_to_type( + tf.convert_to_tensor(expected_grads[0]), + tf.convert_to_tensor(grads[0]), + half_rtol=1e-2, + half_atol=1e-2, + ) diff --git a/deepray/seq2seq/tests/run_all_test.py b/deepray/custom_ops/embedding_bag/python/tests/run_all_test.py similarity index 72% rename from deepray/seq2seq/tests/run_all_test.py rename to deepray/custom_ops/embedding_bag/python/tests/run_all_test.py index d5c4af3d..8261049e 100644 --- a/deepray/seq2seq/tests/run_all_test.py +++ b/deepray/custom_ops/embedding_bag/python/tests/run_all_test.py @@ -1,8 +1,7 @@ from pathlib import Path import sys - import pytest if __name__ == "__main__": dirname = Path(__file__).absolute().parent - sys.exit(pytest.main([str(dirname)])) + sys.exit(pytest.main(["-s", str(dirname)])) diff --git a/deepray/custom_ops/embedding_variable/BUILD b/deepray/custom_ops/embedding_variable/BUILD new file mode 100644 index 00000000..ab953b1b --- /dev/null +++ b/deepray/custom_ops/embedding_variable/BUILD @@ -0,0 +1,282 @@ +load("@local_config_cuda//cuda:build_defs.bzl", "cuda_library", "if_cuda") +load("@local_tsl//tsl/platform/default:build_config.bzl", "py_proto_library") +load("//deepray:deepray.bzl", "custom_op_library") + +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = ["//visibility:public"]) + +proto_library( + name = "config_proto", + srcs = ["config.proto"], +) + +cc_proto_library( + name = "config_proto_cc", + deps = [":config_proto"], +) + +py_proto_library( + name = "config_proto_py_pb2", + srcs = ["config.proto"], + default_runtime = "@com_google_protobuf//:protobuf_python", + protoc = "@com_google_protobuf//:protoc", + srcs_version = "PY3", + deps = [ + "@com_google_protobuf//:protobuf_python", + ], +) + +py_library( + name = "embedding_variable", + srcs = glob( + [ + "python/*.py", + "python/**/*.py", + "*.py", + ], + ), + data = [ + ":_group_embedding_ops.so", + ":_kv_variable_ops.so", + ], + srcs_version = "PY3", + deps = [ + ":config_proto_py_pb2", + "//deepray/utils", + ], +) + +cc_library( + name = "save_restore_tensor_ev", + hdrs = [ + "cc/kernels/save_restore_tensor_ev.h", + ], + deps = [ + "//deepray/custom_ops/embedding_variable/cc/lib:tensor_bundle", + "@local_config_tf//:libtensorflow_cc", + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:tf_header_lib", + ], +) + +cc_library( + name = "kv_variable_util", + srcs = ["cc/kernels/kv_variable_util.cc"], + hdrs = [ + "cc/kernels/kv_variable_util.h", + ], + copts = ["-Wno-unused-result"], + deps = [ + "//deepray/custom_ops/embedding_variable:config_proto_cc", + "//deepray/custom_ops/embedding_variable/cc/embedding:embedding_ops_lib", + ], +) + +cuda_library( + name = "training_ali_lib", + srcs = [ + "cc/kernels/training_ali_ops_gpu.cu.cc", + ], + hdrs = [ + "cc/kernels/training_ali_ops_gpu.h", + ], + deps = [ + 
"//deepray/custom_ops/embedding_variable/cc/embedding:embedding_ops_lib", + ], +) + +GROUP_EMBEDDING_OP_SRCS = [ + "cc/group_embedding/group_embedding_lookup_ops.cc", + "cc/group_embedding/group_embedding_lookup_sparse_backward_ops.cc", + "cc/group_embedding/group_embedding_lookup_sparse_forward_base_ops.h", + "cc/group_embedding/group_embedding_lookup_sparse_forward_ops.cc", +] + +GROUP_EMBEDDING_OP_GPU_SRCS = [ + "cc/group_embedding/group_embedding_lookup_sparse_backward_ops.cu.cc", + "cc/group_embedding/group_embedding_lookup_sparse_forward_ops.cu.cc", + "cc/group_embedding/group_embedding_lookup_sparse_backward_base_ops.cu.h", + "cc/group_embedding/group_embedding_lookup_sparse_forward_base_ops.cu.h", +] + +cuda_library( + name = "fused_embedding_common_cuh", + hdrs = ["cc/fused_embedding/fused_embedding_common.cu.h"], +) + +FUSED_EMBEDDING_OP_SRCS = [ + "cc/fused_embedding/embedding_lookup_sparse_post_op.cc", + "cc/fused_embedding/embedding_lookup_sparse_pre_op.cc", + "cc/fused_embedding/fused_embedding_ops.cc", +] + +FUSED_EMBEDDING_OP_GPU_SRCS = [ + "cc/fused_embedding/fused_embedding_pre_ops_gpus.cu.cc", + "cc/fused_embedding/fused_embedding_post_ops_gpus.cu.cc", +] + +custom_op_library( + name = "_kv_variable_ops.so", + srcs = [ + "cc/kernels/kv_variable_lookup_ops.cc", + "cc/kernels/kv_variable_ops.cc", + "cc/kernels/kv_variable_restore_ops.cc", + "cc/kernels/save_restore_ops.cc", + "cc/kernels/training_adagrad_ops.cc", + "cc/kernels/training_adam_async_ops.cc", + "cc/kernels/training_adam_ops.cc", + "cc/kernels/training_ali_op_helpers.h", + "cc/kernels/training_ftrl_ops.cc", + "cc/kernels/training_sgd_ops.cc", + "cc/ops/kv_variable_ops.cc", + "cc/ops/save_restore_ops.cc", + "cc/ops/training_adagrad_ops.cc", + "cc/ops/training_adam_async_ops.cc", + "cc/ops/training_adam_ops.cc", + "cc/ops/training_ftrl_ops.cc", + "cc/ops/training_sgd_ops.cc", + ], + copts = ["-Wno-unused-result"] + if_cuda(["-DGOOGLE_CUDA=1"]), + gpu_deps = [ + ":training_ali_lib", + ], + gpu_srcs = [ + "cc/kernels/training_ali_ops_gpu.h", + ], + deps = [ + "//deepray/custom_ops/embedding_variable:config_proto_cc", + "//deepray/custom_ops/embedding_variable:kv_variable_util", + "//deepray/custom_ops/unique_ops:unique_ali_util", + "//deepray/custom_ops/utils:spin_rw_lock", + "@com_github_google_leveldb//:leveldb", + "@sparsehash_c11//:dense_hash_map", + ], +) + +py_test( + name = "multiplex_1_test", + size = "medium", + srcs = ["multiplex_1_test.py"], + python_version = "PY3", + srcs_version = "PY3", + tags = [ + "no_mac", # TODO(b/216321151): Re-enable this test. 
+ ], + deps = [ + ":embedding_variable", + "@pypi_numpy//:pkg", + "@pypi_tensorflow//:pkg", + ], +) + +custom_op_library( + name = "_raw_ops.so", + srcs = [ + "cc/kernels/embedding_collection.cc", + "cc/kernels/embedding_collection.hpp", + "cc/ops/embedding_collection.cc", + ], + copts = if_cuda(["-DGOOGLE_CUDA=1"]), + deps = [ + ":hotness_calculate", + ], +) + +cuda_library( + name = "hotness_calculate", + srcs = [ + "cc/kernels/hotness_calculate.cu.cc", + ], + hdrs = [ + "cc/kernels/hotness_calculate.h", + ], + defines = [ + "TF_VERSION_MAJOR=2", + ], + deps = [ + "//deepray/custom_ops/utils:check_util", + "@local_config_cuda//cuda:cuda_headers", + "@local_config_cuda//cuda:cuda_runtime", + "@local_config_cuda//cuda:cudart", + ], +) + +custom_op_library( + name = "_save_restore_ops.so", + srcs = [ + "cc/kernels/save_restore_ops.cc", + "cc/ops/save_restore_ops.cc", + ], + copts = if_cuda(["-DGOOGLE_CUDA=1"]), + deps = [ + "//deepray/custom_ops/embedding_variable/cc/embedding:embedding_ops_lib", + ], +) + +custom_op_library( + name = "_group_embedding_ops.so", + srcs = [ + "cc/ops/group_embedding_ops.cc", + ] + GROUP_EMBEDDING_OP_SRCS, + copts = if_cuda(["-DGOOGLE_CUDA=1"]), + gpu_deps = [ + ":fused_embedding_common_cuh", + ], + gpu_srcs = GROUP_EMBEDDING_OP_GPU_SRCS, + deps = [ + "//deepray/custom_ops/embedding_variable/cc/embedding:embedding_ops_lib", + "//deepray/custom_ops/unique_ops:unique_ali_util", + ], +) + +cc_test( + name = "group_embedding_ops_test", + size = "small", + srcs = ["cc/group_embedding/group_embedding_lookup_ops_test.cc"], + copts = if_cuda(["-DGOOGLE_CUDA=1"]), + deps = [ + ":_group_embedding_ops.so", + "//deepray/custom_ops/embedding_variable:config_proto_cc", + "//deepray/custom_ops/embedding_variable:kv_variable_util", + "//deepray/custom_ops/embedding_variable/cc/embedding:embedding_ops_lib", + "//deepray/custom_ops/utils:fake_input", + "//deepray/custom_ops/utils:kernel_benchmark_testlib", + "//deepray/custom_ops/utils:ops_testutil", + "//deepray/custom_ops/utils:tensor_testutil", + "@com_google_googletest//:gtest", + "@com_google_googletest//:gtest_main", + ], +) + +custom_op_library( + name = "_incr_save_restore_ops.so", + srcs = [ + "cc/incr_save_restore/incr_save_restore_ops.cc", + "cc/incr_save_restore/incr_save_restore_ops.h", + "cc/ops/incr_save_restore_ops.cc", + ], + deps = [ + ":save_restore_tensor_ev", + "//deepray/custom_ops/embedding_variable/cc/embedding:embedding_ops_lib", + ], +) + +cc_test( + name = "incr_save_restore_ops_test", + size = "small", + srcs = ["cc/incr_save_restore/incr_save_restore_ops_test.cc"], + copts = if_cuda(["-DGOOGLE_CUDA=1"]), + deps = [ + ":_incr_save_restore_ops.so", + "//deepray/custom_ops/embedding_variable:config_proto_cc", + "//deepray/custom_ops/embedding_variable:kv_variable_util", + "//deepray/custom_ops/embedding_variable/cc/embedding:embedding_ops_lib", + "//deepray/custom_ops/utils:fake_input", + "//deepray/custom_ops/utils:kernel_benchmark_testlib", + "//deepray/custom_ops/utils:ops_testutil", + "//deepray/custom_ops/utils:tensor_testutil", + "@com_google_googletest//:gtest", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/deepray/custom_ops/embedding_variable/__init__.py b/deepray/custom_ops/embedding_variable/__init__.py new file mode 100644 index 00000000..abbf9a39 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/__init__.py @@ -0,0 +1,3 @@ +from .python import kv_variable_ops +from .python import group_embedding_lookup_ops +from .python.kv_variable_ops import gen_kv_variable_ops \ 
No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/BUILD b/deepray/custom_ops/embedding_variable/cc/embedding/BUILD new file mode 100644 index 00000000..8a2b3f95 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/BUILD @@ -0,0 +1,269 @@ +load("@local_config_cuda//cuda:build_defs.bzl", "cuda_library", "if_cuda") +load( + "@org_tensorflow//tensorflow:tensorflow.bzl", + "tf_copts", +) + +package(default_visibility = ["//visibility:public"]) + +cc_library( + name = "embedding_ops_lib", + deps = [ + ":embedding_gpu", + ":embedding_var", + ":embedding_var_ckpt_data", + ":multi_tier_storage", + ":ssd_record_descriptor", + ], +) + +cc_library( + name = "ssd_record_descriptor", + srcs = ["ssd_record_descriptor.cc"], + hdrs = [ + "counter_filter_descriptor_impl.h", + "dynamic_dim_feature_descriptor_impl.h", + "embedding_config.h", + "embedding_memory_pool.h", + "embedding_var_dump_iterator.h", + "feature_descriptor.h", + "feature_descriptor_impl.h", + "hbm_multi_tier_feature_descriptor.h", + "kv_interface.h", + "normal_feature_descriptor.h", + "ssd_record_descriptor.h", + ], + copts = [ + "-Wno-unused-result", + "-Wno-c++11-narrowing", + ], + deps = [ + "//deepray/custom_ops/embedding_variable/cc/lib:allocator", + "//deepray/custom_ops/embedding_variable:config_proto_cc", + "//deepray/custom_ops/embedding_variable:save_restore_tensor_ev", + # "@org_tensorflow//tensorflow/core/common_runtime/gpu:gpu_lib", + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:libtensorflow_cc", + "@local_config_tf//:tf_header_lib", + "@sparsehash_c11//:dense_hash_map", + ], +) + +cuda_library( + name = "multi_tier_storage", + srcs = [ + "multi_tier_storage.cu.cc", + ], + hdrs = [ + "bloom_filter_policy.h", + "cache.h", + "cache_factory.h", + "cache_thread_pool_creator.h", + "counter_filter_descriptor_impl.h", + "counter_filter_policy.h", + "cpu_hash_map_kv.h", + "dram_leveldb_storage.h", + "dram_pmem_storage.h", + "dram_ssd_storage.h", + "dynamic_dim_feature_descriptor_impl.h", + "emb_file.h", + "emb_file_creator.h", + "embedding_config.h", + "embedding_memory_pool.h", + "embedding_var.h", + "embedding_var_ckpt_data.h", + "embedding_var_context.h", + "embedding_var_dump_iterator.h", + "embedding_var_restore.h", + "eviction_manager.h", + "feature_descriptor.h", + "feature_descriptor_impl.h", + "filter_factory.h", + "filter_policy.h", + "globalstep_shrink_policy.h", + "gpu_hash_map_kv.h", + "hbm_dram_ssd_storage.h", + "hbm_dram_storage.h", + "hbm_multi_tier_feature_descriptor.h", + "hbm_storage_iterator.h", + "intra_thread_copy_id_allocator.h", + "kv_interface.h", + "l2weight_shrink_policy.h", + "leveldb_kv.h", + "multi_tier_storage.h", + "normal_feature_descriptor.h", + "nullable_filter_policy.h", + "shrink_policy.h", + "single_tier_storage.h", + "ssd_hash_kv.h", + "ssd_record_descriptor.h", + "storage.h", + "storage_config.h", + "storage_factory.h", + ], + copts = [ + "-Wno-unused-result", + ], + deps = [ + ":embedding_gpu", + "//deepray/custom_ops/embedding_variable/cc/lib:allocator", + "//deepray/custom_ops/embedding_variable:config_proto_cc", + "//deepray/custom_ops/utils:spin_rw_lock", + "@com_github_google_leveldb//:leveldb", + # "@org_tensorflow//tensorflow/core:framework_headers_lib", + # "@org_tensorflow//tensorflow/core/common_runtime:core_cpu", + # "@org_tensorflow//tensorflow/core/common_runtime/gpu:gpu_runtime", + # "@org_tensorflow//tensorflow/core/kernels:gpu_device_array", + "@local_config_tf//:libtensorflow_framework", + 
"@local_config_tf//:libtensorflow_cc", + "@local_config_tf//:tf_header_lib", + "@sparsehash_c11//:dense_hash_map", + ], +) + +cc_library( + name = "embedding_var_ckpt_data", + srcs = ["embedding_var_ckpt_data.cc"], + hdrs = [ + "counter_filter_descriptor_impl.h", + "dynamic_dim_feature_descriptor_impl.h", + "embedding_config.h", + "embedding_memory_pool.h", + "embedding_var_ckpt_data.h", + "embedding_var_dump_iterator.h", + "feature_descriptor.h", + "feature_descriptor_impl.h", + "hbm_multi_tier_feature_descriptor.h", + "kv_interface.h", + "normal_feature_descriptor.h", + ], + copts = [ + "-Wno-c++11-narrowing", + ], + deps = [ + "//deepray/custom_ops/embedding_variable/cc/lib:allocator", + "//deepray/custom_ops/embedding_variable:config_proto_cc", + "//deepray/custom_ops/embedding_variable:save_restore_tensor_ev", + # "@org_tensorflow//tensorflow/core/common_runtime/gpu:gpu_lib", + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:libtensorflow_cc", + "@local_config_tf//:tf_header_lib", + "@sparsehash_c11//:dense_hash_map", + ], +) + +cuda_library( + name = "embedding_gpu", + srcs = [ + "batch.cu.cc", + "gpu_hash_table.cu.cc", + ], + hdrs = [ + "batch.h", + "gpu_hash_table.h", + ], + copts = tf_copts(allow_exceptions = True) + if_cuda([ + "-DNV_CUDNN_DISABLE_EXCEPTION", + ]) + select({ + "//conditions:default": [], + "@local_config_cuda//cuda:using_nvcc": [ + "-nvcc_options=relaxed-constexpr", + #"-nvcc_options=ftz=true", + ], + "@local_config_cuda//cuda:using_clang": [ + "-fcuda-flush-denormals-to-zero", + ], + }), + visibility = ["//visibility:public"], + deps = [ + "@com_github_google_leveldb//:leveldb", + "@cuCollections//:cuco_hash_table", + "@libcuckoo", + # "@org_tensorflow//tensorflow/core:framework_headers_lib", + # "@org_tensorflow//tensorflow/core/platform:stream_executor", + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:libtensorflow_cc", + "@local_config_tf//:tf_header_lib", + "@sparsehash_c11//:dense_hash_map", + ], + alwayslink = 1, +) + +cuda_library( + name = "embedding_var", + srcs = [ + "embedding_var.cu.cc", + "embedding_var_restore.cc", + ], + hdrs = [ + "bloom_filter_policy.h", + "cache.h", + "cache_factory.h", + "cache_thread_pool_creator.h", + "counter_filter_descriptor_impl.h", + "counter_filter_policy.h", + "cpu_hash_map_kv.h", + "dram_leveldb_storage.h", + "dram_pmem_storage.h", + "dram_ssd_storage.h", + "dynamic_dim_feature_descriptor_impl.h", + "emb_file.h", + "emb_file_creator.h", + "embedding_config.h", + "embedding_memory_pool.h", + "embedding_var.h", + "embedding_var_context.h", + "embedding_var_dump_iterator.h", + "embedding_var_restore.h", + "eviction_manager.h", + "feature_descriptor.h", + "feature_descriptor_impl.h", + "filter_factory.h", + "filter_policy.h", + "globalstep_shrink_policy.h", + "gpu_hash_map_kv.h", + "hbm_dram_ssd_storage.h", + "hbm_dram_storage.h", + "hbm_multi_tier_feature_descriptor.h", + "hbm_storage_iterator.h", + "intra_thread_copy_id_allocator.h", + "kv_interface.h", + "l2weight_shrink_policy.h", + "leveldb_kv.h", + "normal_feature_descriptor.h", + "nullable_filter_policy.h", + "shrink_policy.h", + "single_tier_storage.h", + "ssd_hash_kv.h", + "storage.h", + "storage_config.h", + "storage_factory.h", + ], + copts = tf_copts() + ["-g"] + select({ + "//conditions:default": [], + "@local_config_cuda//cuda:using_nvcc": [ + "-nvcc_options=relaxed-constexpr", + ], + "@local_config_cuda//cuda:using_clang": [ + "-fcuda-flush-denormals-to-zero", + ], + }) + [ + "-Wno-unused-result", + ], + deps = [ + 
"//deepray/custom_ops/embedding_variable/cc/lib:allocator", + ":embedding_gpu", + ":embedding_var_ckpt_data", + ":multi_tier_storage", + ":ssd_record_descriptor", + "//deepray/custom_ops/embedding_variable:config_proto_cc", + "//deepray/custom_ops/utils:spin_rw_lock", + "@com_github_google_leveldb//:leveldb", + # "@org_tensorflow//tensorflow/core/common_runtime/gpu:gpu_runtime", + # "@org_tensorflow//tensorflow/core/kernels:gpu_device_array", + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:libtensorflow_cc", + "@local_config_tf//:tf_header_lib", + "@sparsehash_c11//:dense_hash_map", + ], +) diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/batch.cu.cc b/deepray/custom_ops/embedding_variable/cc/embedding/batch.cu.cc new file mode 100644 index 00000000..6323b151 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/batch.cu.cc @@ -0,0 +1,219 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ + +#if GOOGLE_CUDA + +#include "batch.h" + +#include "tensorflow/core/framework/register_types.h" + +namespace tensorflow { +namespace embedding { +template +__global__ void BatchCopy(V** batch, V* val_base, int value_len, int limit) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int item_id = i / value_len; + int item_pos = i % value_len; + + if (i < limit * value_len) { + val_base[i] = *(batch[item_id] + item_pos); + } +} + +#define REGISTER_KERNELS_ALL_INDEX(T) \ + template __global__ void BatchCopy(T**, T*, int, int); +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX); +TF_CALL_int32(REGISTER_KERNELS_ALL_INDEX); +TF_CALL_int64(REGISTER_KERNELS_ALL_INDEX); +#undef REGISTER_KERNELS_ALL_INDEX + +template +__global__ void BatchUnpack(V** dev_value_address, V* memcpy_buffer_gpu, + int value_len, int limit) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int item_id = i / value_len; + int item_pos = i % value_len; + + if (i < limit * value_len) { + *(dev_value_address[item_id] + item_pos) = memcpy_buffer_gpu[i]; + } +} + +#define REGISTER_KERNELS_ALL_INDEX(T) \ + template __global__ void BatchUnpack(T**, T*, int, int); +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX); +TF_CALL_int32(REGISTER_KERNELS_ALL_INDEX); +TF_CALL_int64(REGISTER_KERNELS_ALL_INDEX); +#undef REGISTER_KERNELS_ALL_INDEX + +template +__global__ void CopyEmbedding(V** batch, V** batch_data_space, int total_dims, + int limit) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int item_id = i / total_dims; + int item_pos = i % total_dims; + + if (i < limit * total_dims) { + *(batch_data_space[item_id] + item_pos) = *(batch[item_id] + item_pos); + } +} + +#define REGISTER_KERNELS_ALL_INDEX(T) \ + template __global__ void CopyEmbedding(T**, T**, int, int); +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX); +#undef REGISTER_KERNELS_ALL_INDEX +} // namespace embedding + +template +__global__ void SparseApplyAdagradGPU(V** a, V** v, const V* g, V lr, + int embedding_dim, long 
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  int item_id = i / embedding_dim;
+  int item_pos = i % embedding_dim;
+
+  if (i < limit * embedding_dim) {
+    *(a[item_id] + item_pos) += g[i] * g[i];
+    *(v[item_id] + item_pos) -= lr * g[i] * rsqrt(*(a[item_id] + item_pos));
+  }
+}
+
+#define REGISTER_KERNELS_ALL_INDEX(T)                                      \
+  template __global__ void SparseApplyAdagradGPU<T>(T**, T**, const T*, T, \
+                                                    int, long long int);
+TF_CALL_float(REGISTER_KERNELS_ALL_INDEX);
+TF_CALL_double(REGISTER_KERNELS_ALL_INDEX);
+#undef REGISTER_KERNELS_ALL_INDEX
+
+template <class V>
+__global__ void SparseApplyAdamGPU(V** var, V** m, V** v, const V* g, V lr,
+                                   V beta1, V beta2, V epsilon, V beta1_power,
+                                   V beta2_power, int embedding_dim,
+                                   long long int limit) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  int item_id = i / embedding_dim;
+  int item_pos = i % embedding_dim;
+
+  if (i < limit * embedding_dim) {
+    const V alpha = lr * sqrt(static_cast<V>(1) - beta2_power) /
+                    (static_cast<V>(1) - beta1_power);
+    *(m[item_id] + item_pos) =
+        *(m[item_id] + item_pos) * beta1 + g[i] * (1 - beta1);
+    *(v[item_id] + item_pos) =
+        *(v[item_id] + item_pos) * beta2 + g[i] * g[i] * (1 - beta2);
+    *(var[item_id] + item_pos) -= (*(m[item_id] + item_pos) * alpha) /
+                                  (sqrt(*(v[item_id] + item_pos)) + epsilon);
+  }
+  __syncthreads();
+}
+
+#define REGISTER_KERNELS_ALL_INDEX(T)             \
+  template __global__ void SparseApplyAdamGPU<T>( \
+      T**, T**, T**, const T*, T, T, T, T, T, T, int, long long int);
+TF_CALL_float(REGISTER_KERNELS_ALL_INDEX);
+TF_CALL_double(REGISTER_KERNELS_ALL_INDEX);
+#undef REGISTER_KERNELS_ALL_INDEX
+
+template <class V>
+__global__ void SparseApplyAdamAsyncGPU(V** var, V** m, V** v, const V* g,
+                                        V lr, V beta1, V beta2, V epsilon,
+                                        V* beta1_power_ptr, V* beta2_power_ptr,
+                                        int embedding_dim,
+                                        long long int limit) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  int item_id = i / embedding_dim;
+  int item_pos = i % embedding_dim;
+
+  if (i < limit * embedding_dim) {
+    V beta1_power = *beta1_power_ptr;
+    V beta2_power = *beta2_power_ptr;
+    const V alpha = lr * sqrt(static_cast<V>(1) - beta2_power) /
+                    (static_cast<V>(1) - beta1_power);
+    *(m[item_id] + item_pos) =
+        *(m[item_id] + item_pos) * beta1 + g[i] * (1 - beta1);
+    *(v[item_id] + item_pos) =
+        *(v[item_id] + item_pos) * beta2 + g[i] * g[i] * (1 - beta2);
+    *(var[item_id] + item_pos) -= (*(m[item_id] + item_pos) * alpha) /
+                                  (sqrt(*(v[item_id] + item_pos)) + epsilon);
+  }
+  __syncthreads();
+
+  if (i == 0) {
+    *beta1_power_ptr *= beta1;
+    *beta2_power_ptr *= beta2;
+  }
+}
+
+#define REGISTER_KERNELS_ALL_INDEX(T)                  \
+  template __global__ void SparseApplyAdamAsyncGPU<T>( \
+      T**, T**, T**, const T*, T, T, T, T, T*, T*, int, long long int);
+TF_CALL_float(REGISTER_KERNELS_ALL_INDEX);
+TF_CALL_double(REGISTER_KERNELS_ALL_INDEX);
+#undef REGISTER_KERNELS_ALL_INDEX
+
+template <class V>
+__global__ void SparseApplyAdamAsyncSparseRmspropGPU(V** var, V** m, V** v,
+                                                     const V* g, V lr, V beta1,
+                                                     V beta2, V epsilon,
+                                                     int embedding_dim,
+                                                     long long int limit) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  int item_id = i / embedding_dim;
+  int item_pos = i % embedding_dim;
+
+  if (i < limit * embedding_dim) {
+    *(v[item_id] + item_pos) =
+        *(v[item_id] + item_pos) * beta2 + g[i] * g[i] * (1.0 - beta2);
+    *(m[item_id] + item_pos) =
+        *(m[item_id] + item_pos) * beta1 +
+        rsqrt(*(v[item_id] + item_pos) + epsilon) * lr * g[i];
+    *(var[item_id] + item_pos) -= *(m[item_id] + item_pos);
+  }
+}
+
+#define REGISTER_KERNELS_ALL_INDEX(T)                               \
+  template __global__ void SparseApplyAdamAsyncSparseRmspropGPU<T>( \
+      T**, T**, T**, const T*, T, T, T, T, int, long long int);
+TF_CALL_float(REGISTER_KERNELS_ALL_INDEX);
+TF_CALL_double(REGISTER_KERNELS_ALL_INDEX);
+#undef REGISTER_KERNELS_ALL_INDEX
+
+template <class V>
+__global__ void SparseApplyAdamWGPU(V** var, V** m, V** v, const V* g, V alpha,
+                                    V beta1, V beta2, V epsilon, V weight_decay,
+                                    int embedding_dim, long long int limit) {
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  int item_id = i / embedding_dim;
+  int item_pos = i % embedding_dim;
+
+  if (i < limit * embedding_dim) {
+    *(m[item_id] + item_pos) +=
+        (g[i] - *(m[item_id] + item_pos)) * (1.0 - beta1);
+    *(v[item_id] + item_pos) +=
+        (g[i] * g[i] - *(v[item_id] + item_pos)) * (1.0 - beta2);
+    *(var[item_id] + item_pos) -=
+        (*(m[item_id] + item_pos) * alpha) /
+            (sqrt(*(v[item_id] + item_pos)) + epsilon) +
+        weight_decay * (*(var[item_id] + item_pos));
+  }
+}
+
+#define REGISTER_KERNELS_ALL_INDEX(T)              \
+  template __global__ void SparseApplyAdamWGPU<T>( \
+      T**, T**, T**, const T*, T, T, T, T, T, int, long long int);
+TF_CALL_float(REGISTER_KERNELS_ALL_INDEX);
+TF_CALL_double(REGISTER_KERNELS_ALL_INDEX);
+#undef REGISTER_KERNELS_ALL_INDEX
+}  // namespace tensorflow
+#endif  // GOOGLE_CUDA
diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/batch.h b/deepray/custom_ops/embedding_variable/cc/embedding/batch.h
new file mode 100644
index 00000000..800e2e3c
--- /dev/null
+++ b/deepray/custom_ops/embedding_variable/cc/embedding/batch.h
@@ -0,0 +1,66 @@
+/* Copyright 2022 The DeepRec Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+======================================================================*/
+
+#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_BATCH_
+#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_BATCH_
+
+#if GOOGLE_CUDA
+namespace tensorflow {
+namespace embedding {
+
+template <class V>
+__global__ void BatchCopy(V** batch, V* val_base, int value_len, int limit);
+
+template <class V>
+__global__ void BatchUnpack(V** dev_value_address, V* memcpy_buffer_gpu,
+                            int value_len, int limit);
+
+template <class V>
+__global__ void CopyEmbedding(V** batch, V** batch_data_space, int total_dims,
+                              int limit);
+}  // namespace embedding
+
+template <class V>
+__global__ void SparseApplyAdagradGPU(V** a, V** v, const V* g, V lr,
+                                      int embedding_dim, long long int limit);
+
+template <class V>
+__global__ void SparseApplyAdamGPU(V** var, V** m, V** v, const V* g, V lr,
+                                   V beta1, V beta2, V epsilon, V beta1_power,
+                                   V beta2_power, int embedding_dim,
+                                   long long int limit);
+
+template <class V>
+__global__ void SparseApplyAdamAsyncGPU(V** var, V** m, V** v, const V* g,
+                                        V lr, V beta1, V beta2, V epsilon,
+                                        V* beta1_power_ptr, V* beta2_power_ptr,
+                                        int embedding_dim, long long int limit);
+
+template <class V>
+__global__ void SparseApplyAdamAsyncSparseRmspropGPU(V** var, V** m, V** v,
+                                                     const V* g, V lr, V beta1,
+                                                     V beta2, V epsilon,
+                                                     int embedding_dim,
+                                                     long long int limit);
+
+template <class V>
+__global__ void SparseApplyAdamWGPU(V** var, V** m, V** v, const V* g, V alpha,
+                                    V beta1, V beta2, V epsilon, V weight_decay,
+                                    int embedding_dim, long long int limit);
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_BATCH_
diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/bloom_filter_policy.h b/deepray/custom_ops/embedding_variable/cc/embedding/bloom_filter_policy.h
new file mode 100644
index 00000000..6d30bbc8
--- /dev/null
+++ b/deepray/custom_ops/embedding_variable/cc/embedding/bloom_filter_policy.h
@@ -0,0 +1,438 @@
+/* Copyright 2022 The DeepRec Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_BLOOM_FILTER_POLICY_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_BLOOM_FILTER_POLICY_H_ + +#include "embedding_config.h" +#include "filter_policy.h" +#include "intra_thread_copy_id_allocator.h" + +namespace tensorflow { + +namespace { +const static std::vector default_seeds = { + 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, + 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97}; +} + +template +class BloomFilterPolicy : public FilterPolicy { + using FilterPolicy::ev_; + using FilterPolicy::config_; + + public: + BloomFilterPolicy(const EmbeddingConfig& config, EV* ev, + embedding::FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc), FilterPolicy(config, ev) { + switch (config_.counter_type) { + case DT_UINT64: + VLOG(2) << "The type of bloom counter is uint64"; + bloom_counter_ = (void*)calloc(config_.num_counter, sizeof(long)); + break; + case DT_UINT32: + VLOG(2) << "The type of bloom counter is uint32"; + bloom_counter_ = (void*)calloc(config_.num_counter, sizeof(int)); + break; + case DT_UINT16: + VLOG(2) << "The type of bloom counter is uint16"; + bloom_counter_ = (void*)calloc(config_.num_counter, sizeof(int16)); + break; + case DT_UINT8: + VLOG(2) << "The type of bloom counter is uint8"; + bloom_counter_ = (void*)calloc(config_.num_counter, sizeof(bool)); + break; + default: + VLOG(2) << "defualt type of counter is uint64"; + bloom_counter_ = (void*)calloc(config_.num_counter, sizeof(long)); + } + GenerateSeed(config.kHashFunc); + } + + Status Lookup(K key, V* val, const V* default_value_ptr, + const V* default_value_no_permission) override { + void* value_ptr = nullptr; + Status s = ev_->LookupKey(key, &value_ptr); + if (s.ok()) { + V* mem_val = feat_desc_->GetEmbedding(value_ptr, config_.emb_index); + memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); + } else { + memcpy(val, default_value_no_permission, sizeof(V) * ev_->ValueLen()); + } + return OkStatus(); + } + +#if GOOGLE_CUDA + void BatchLookup(const EmbeddingVarContext& ctx, const K* keys, + V* output, int64 num_of_keys, V* default_value_ptr, + V* default_value_no_permission) override { + std::vector value_ptr_list(num_of_keys, nullptr); + ev_->BatchLookupKey(ctx, keys, value_ptr_list.data(), num_of_keys); + std::vector embedding_ptr(num_of_keys, nullptr); + auto do_work = [this, value_ptr_list, &embedding_ptr, default_value_ptr, + default_value_no_permission](int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + void* value_ptr = value_ptr_list[i]; + if (value_ptr != nullptr) { + embedding_ptr[i] = + feat_desc_->GetEmbedding(value_ptr, config_.emb_index); + } else { + embedding_ptr[i] = default_value_no_permission; + } + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + 1000, do_work); + auto stream = ctx.compute_stream; + auto event_mgr = ctx.event_mgr; + ev_->CopyEmbeddingsToBuffer(output, num_of_keys, embedding_ptr.data(), + stream, event_mgr, ctx.gpu_device); + } + + void BatchLookupOrCreateKey(const EmbeddingVarContext& ctx, + const K* keys, void** value_ptrs_list, + int64 num_of_keys) { + int num_worker_threads = ctx.worker_threads->num_threads; + std::vector> lookup_or_create_ids(num_worker_threads); + std::vector> lookup_or_create_cursor(num_worker_threads); + std::vector> lookup_or_create_ptrs(num_worker_threads); + IntraThreadCopyIdAllocator thread_copy_id_alloc(num_worker_threads); + std::vector> 
not_found_cursor_list(num_worker_threads + 1); + uint64 main_thread_id = Env::Default()->GetCurrentThreadId(); + + auto do_work = [this, keys, value_ptrs_list, &lookup_or_create_ids, + &lookup_or_create_ptrs, &lookup_or_create_cursor, + main_thread_id, + &thread_copy_id_alloc](int64 start, int64 limit) { + int copy_id = thread_copy_id_alloc.GetCopyIdOfThread(main_thread_id); + for (int i = start; i < limit; i++) { + if (GetBloomFreq(keys[i]) >= config_.filter_freq) { + lookup_or_create_ids[copy_id].emplace_back(keys[i]); + lookup_or_create_ptrs[copy_id].emplace_back(value_ptrs_list[i]); + lookup_or_create_cursor[copy_id].emplace_back(i); + } else { + AddFreq(keys[i], 1); + } + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + 1000, do_work); + + std::vector total_ids(num_of_keys); + std::vector total_ptrs(num_of_keys); + std::vector total_cursors(num_of_keys); + int num_of_admit_id = 0; + for (int i = 0; i < num_worker_threads; i++) { + if (lookup_or_create_ids[i].size() > 0) { + memcpy(total_ids.data() + num_of_admit_id, + lookup_or_create_ids[i].data(), + sizeof(K) * lookup_or_create_ids[i].size()); + memcpy(total_ptrs.data() + num_of_admit_id, + lookup_or_create_ptrs[i].data(), + sizeof(void*) * lookup_or_create_ptrs[i].size()); + memcpy(total_cursors.data() + num_of_admit_id, + lookup_or_create_cursor[i].data(), + sizeof(int) * lookup_or_create_cursor[i].size()); + num_of_admit_id += lookup_or_create_ids[i].size(); + } + } + + ev_->BatchLookupOrCreateKey(ctx, total_ids.data(), total_ptrs.data(), + num_of_keys, not_found_cursor_list); + for (int i = 0; i < total_ptrs.size(); i++) { + value_ptrs_list[total_cursors[i]] = total_ptrs[i]; + } + } +#endif // GOOGLE_CUDA + + void LookupOrCreate(K key, V* val, const V* default_value_ptr, + void** value_ptr, int count, + const V* default_value_no_permission) override { + if (GetBloomFreq(key) >= config_.filter_freq) { + bool is_filter = true; + TF_CHECK_OK(LookupOrCreateKey(key, value_ptr, &is_filter, count)); + V* mem_val = feat_desc_->GetEmbedding(*value_ptr, config_.emb_index); + memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); + } else { + AddFreq(key, count); + memcpy(val, default_value_no_permission, sizeof(V) * ev_->ValueLen()); + } + } + + Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, + int64 count) override { + *value_ptr = nullptr; + if ((GetFreq(key, *value_ptr) + count) >= config_.filter_freq) { + Status s = ev_->LookupKey(key, value_ptr); + if (!s.ok()) { + *value_ptr = feat_desc_->Allocate(); + feat_desc_->SetDefaultValue(*value_ptr, key); + ev_->storage()->Insert(key, value_ptr); + s = OkStatus(); + } + *is_filter = true; + feat_desc_->AddFreq(*value_ptr, count); + } else { + *is_filter = false; + AddFreq(key, count); + } + return OkStatus(); + } + + int64 GetFreq(K key, void* val) override { return GetBloomFreq(key); } + + int64 GetFreq(K key) override { return GetBloomFreq(key); } + + void* GetBloomCounter() const { return bloom_counter_; } + + bool is_admit(K key, void* value_ptr) override { + if (value_ptr == nullptr) { + return false; + } else { + return GetFreq(key, value_ptr) >= config_.filter_freq; + } + } + + private: + int64 GetBloomFreq(K key) { + std::vector hash_val; + for (int64 i = 0; i < config_.kHashFunc; i++) { + hash_val.emplace_back(FastHash64(key, seeds_[i]) % config_.num_counter); + } + int64 min_freq; + switch (config_.counter_type) { + case DT_UINT64: + min_freq = GetMinFreq(hash_val); + break; + case 
DT_UINT32: + min_freq = GetMinFreq(hash_val); + break; + case DT_UINT16: + min_freq = GetMinFreq(hash_val); + break; + case DT_UINT8: + min_freq = GetMinFreq(hash_val); + break; + default: + min_freq = GetMinFreq(hash_val); + } + return min_freq; + } + +#define mix(h) \ + ({ \ + (h) ^= (h) >> 23; \ + (h) *= 0x2127599bf4325c37ULL; \ + (h) ^= (h) >> 47; \ + }) + + uint64_t FastHash64(K key, uint64_t seed) { + const uint64_t m = 0x880355f21e6d1965ULL; + + uint64_t h = seed ^ (8 * m); + uint64_t v; + v = key; + h ^= mix(v); + h *= m; + + v = 0; + h ^= mix(v); + h *= m; + + return mix(h); + } + + template + int64 GetMinFreq(std::vector hash_val) { + VBloom min_freq = *((VBloom*)bloom_counter_ + hash_val[0]); + for (auto it : hash_val) { + min_freq = std::min(*((VBloom*)bloom_counter_ + it), min_freq); + } + return min_freq; + } + + template + void SetMinFreq(std::vector hash_val, int64 freq) { + for (auto it : hash_val) { + *((VBloom*)bloom_counter_ + it) = freq; + } + } + + void SetBloomFreq(K key, int64 freq) { + std::vector hash_val; + for (int64 i = 0; i < config_.kHashFunc; i++) { + hash_val.emplace_back(FastHash64(key, seeds_[i]) % config_.num_counter); + } + switch (config_.counter_type) { + case DT_UINT64: + SetMinFreq(hash_val, freq); + break; + case DT_UINT32: + SetMinFreq(hash_val, freq); + break; + case DT_UINT16: + SetMinFreq(hash_val, freq); + break; + case DT_UINT8: + SetMinFreq(hash_val, freq); + break; + default: + SetMinFreq(hash_val, freq); + } + } + + Status Restore(int64 key_num, int bucket_num, int64 partition_id, + int64 partition_num, int64 value_len, bool is_filter, + bool to_dram, bool is_incr, + RestoreBuffer& restore_buff) override { + K* key_buff = (K*)restore_buff.key_buffer; + V* value_buff = (V*)restore_buff.value_buffer; + int64* version_buff = (int64*)restore_buff.version_buffer; + int64* freq_buff = (int64*)restore_buff.freq_buffer; + if (to_dram) { + LOG(FATAL) << "BloomFilter dosen't support ImportToDRAM"; + return OkStatus(); + } + + for (auto i = 0; i < key_num; ++i) { + // this can describe by graph(Mod + DynamicPartition), + // but memory waste and slow + if (*(key_buff + i) % bucket_num % partition_num != partition_id) { + VLOG(1) << "skip EV key:" << *(key_buff + i); + continue; + } + void* value_ptr = nullptr; + int64 new_freq = freq_buff[i]; + int64 import_version = -1; + if (config_.steps_to_live != 0 || config_.record_version) { + import_version = version_buff[i]; + } + if (!is_filter) { + if (freq_buff[i] >= config_.filter_freq) { + SetBloomFreq(key_buff[i], freq_buff[i]); + } else { + SetBloomFreq(key_buff[i], config_.filter_freq); + new_freq = config_.filter_freq; + } + } else { + SetBloomFreq(key_buff[i], freq_buff[i]); + } + if (new_freq >= config_.filter_freq) { + ev_->storage()->Import(key_buff[i], value_buff + i * ev_->ValueLen(), + new_freq, import_version, config_.emb_index); + } + } + return OkStatus(); + } + + void AddFreq(K key) { + std::vector hash_val; + for (int64 i = 0; i < config_.kHashFunc; i++) { + hash_val.emplace_back(FastHash64(key, seeds_[i]) % config_.num_counter); + } + + for (auto it : hash_val) { + switch (config_.counter_type) { + case DT_UINT64: + if (*((uint64*)bloom_counter_ + it) < config_.filter_freq) + __sync_fetch_and_add((uint64*)bloom_counter_ + it, 1); + break; + case DT_UINT32: + if (*((uint32*)bloom_counter_ + it) < config_.filter_freq) + __sync_fetch_and_add((uint32*)bloom_counter_ + it, 1); + break; + case DT_UINT16: + if (*((uint16*)bloom_counter_ + it) < config_.filter_freq) + 
__sync_fetch_and_add((uint16*)bloom_counter_ + it, 1); + break; + case DT_UINT8: + if (*((uint8*)bloom_counter_ + it) < config_.filter_freq) + __sync_fetch_and_add((uint8*)bloom_counter_ + it, 1); + break; + default: + if (*((uint64*)bloom_counter_ + it) < config_.filter_freq) + __sync_fetch_and_add((uint64*)bloom_counter_ + it, 1); + } + } + } + + void AddFreq(K key, int64 count) { + std::vector hash_val; + for (int64 i = 0; i < config_.kHashFunc; i++) { + hash_val.emplace_back(FastHash64(key, seeds_[i]) % config_.num_counter); + } + + for (auto it : hash_val) { + switch (config_.counter_type) { + case DT_UINT64: + if (*((uint64*)bloom_counter_ + it) < config_.filter_freq) + __sync_fetch_and_add((uint64*)bloom_counter_ + it, count); + break; + case DT_UINT32: + if (*((uint32*)bloom_counter_ + it) < config_.filter_freq) + __sync_fetch_and_add((uint32*)bloom_counter_ + it, count); + break; + case DT_UINT16: + if (*((uint16*)bloom_counter_ + it) < config_.filter_freq) + __sync_fetch_and_add((uint16*)bloom_counter_ + it, count); + break; + case DT_UINT8: + if (*((uint8*)bloom_counter_ + it) < config_.filter_freq) + __sync_fetch_and_add((uint8*)bloom_counter_ + it, count); + break; + default: + if (*((uint64*)bloom_counter_ + it) < config_.filter_freq) + __sync_fetch_and_add((uint64*)bloom_counter_ + it, count); + } + } + } + + void GenerateSeed(int64 kHashFunc) { + if (kHashFunc < default_seeds.size()) { + for (int64 i = 0; i < kHashFunc; i++) { + seeds_.emplace_back(default_seeds[i]); + } + } else { + for (int64 i = 0; i < default_seeds.size(); i++) { + seeds_.emplace_back(default_seeds[i]); + } + int64 last_seed = 98; + for (int64 i = default_seeds.size(); i < kHashFunc; i++) { + for (int64 j = last_seed;; j++) { + if (j % 2 == 0) continue; + bool is_prime = true; + for (int64 k = 0; k <= std::sqrt(j) + 1; k++) { + if (j % k == 0) is_prime = false; + } + if (is_prime) { + seeds_.emplace_back(j); + last_seed = j; + break; + } + } + } + } + } + + private: + void* bloom_counter_; + embedding::FeatureDescriptor* feat_desc_; + std::vector seeds_; +}; +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_BLOOM_FILTER_POLICY_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/cache.h b/deepray/custom_ops/embedding_variable/cc/embedding/cache.h new file mode 100644 index 00000000..5c9a51a9 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/cache.h @@ -0,0 +1,521 @@ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_CACHE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_CACHE_H_ +#include +#include +#include +#include +#include +#include + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace embedding { + +template +class BatchCache { + public: + BatchCache() {} + virtual ~BatchCache() {} + void update(const Tensor& t) { update((K*)t.data(), t.NumElements()); } + void add_to_prefetch_list(const Tensor& t) { + add_to_prefetch_list((K*)t.data(), t.NumElements()); + } + void add_to_cache(const Tensor& t) { + add_to_cache((K*)t.data(), t.NumElements()); + } + + void update(const Tensor& t, const Tensor& counts_tensor) { + update((K*)t.data(), t.NumElements(), nullptr, + (int64*)counts_tensor.data()); + } + + virtual size_t get_evic_ids(K* evic_ids, size_t k_size) = 0; + virtual size_t get_cached_ids(K* cached_ids, size_t k_size, + int64* 
cached_versions, + int64* cached_freqs) = 0; + virtual void update(const K* batch_ids, size_t batch_size, + bool use_locking = true) = 0; + virtual void update(const K* batch_ids, size_t batch_size, + const int64* batch_versions, const int64* batch_freqs, + bool use_locking = true) = 0; + virtual void add_to_prefetch_list(const K* batch_ids, size_t batch_size) = 0; + virtual void add_to_cache(const K* batch_ids, size_t batch_size) = 0; + virtual size_t size() = 0; + virtual void reset_status() { + num_hit = 0; + num_miss = 0; + } + std::string DebugString() { + float hit_rate = 0.0; + if (num_hit > 0 || num_miss > 0) { + hit_rate = num_hit * 100.0 / (num_hit + num_miss); + } + return strings::StrCat("HitRate = ", hit_rate, + " %, visit_count = ", num_hit + num_miss, + ", hit_count = ", num_hit); + } + virtual mutex_lock maybe_lock_cache(mutex& mu, mutex& temp_mu, + bool use_locking) { + if (use_locking) { + mutex_lock l(mu); + return l; + } else { + mutex_lock l(temp_mu); + return l; + } + } + + protected: + int64 num_hit; + int64 num_miss; +}; + +template +class PrefetchNode { + public: + explicit PrefetchNode() : key_(-1), ref_count_(1) {} + explicit PrefetchNode(K id) : key_(id), ref_count_(1) {} + virtual ~PrefetchNode() {} + virtual void Ref() { ref_count_++; }; + virtual void UnRef() { ref_count_--; }; + virtual K key() { return key_; } + virtual int64 ref_count() { return ref_count_; } + + protected: + K key_; + int64 ref_count_; +}; + +template +class PrefetchLFUNode : public PrefetchNode { + public: + explicit PrefetchLFUNode(K id) { + PrefetchNode::key_ = id; + PrefetchNode::ref_count_ = 1; + freq_ = 1; + } + + PrefetchLFUNode(K id, int64 freq) { + PrefetchNode::key_ = id; + PrefetchNode::ref_count_ = 1; + freq_ = freq; + } + + void Ref() override { + PrefetchNode::ref_count_++; + freq_++; + } + + int64 freq() { return freq_; } + + private: + int64 freq_; +}; + +template +class LRUCache : public BatchCache { + public: + LRUCache() { + mp.clear(); + head = new LRUNode(0); + tail = new LRUNode(0); + head->next = tail; + tail->pre = head; + BatchCache::num_hit = 0; + BatchCache::num_miss = 0; + } + + size_t size() { + mutex_lock l(mu_); + return mp.size(); + } + + size_t get_evic_ids(K* evic_ids, size_t k_size) { + mutex_lock l(mu_); + size_t true_size = 0; + LRUNode* evic_node = tail->pre; + LRUNode* rm_node = evic_node; + for (size_t i = 0; i < k_size && evic_node != head; ++i) { + evic_ids[i] = evic_node->id; + rm_node = evic_node; + evic_node = evic_node->pre; + mp.erase(rm_node->id); + delete rm_node; + true_size++; + } + evic_node->next = tail; + tail->pre = evic_node; + return true_size; + } + + size_t get_cached_ids(K* cached_ids, size_t k_size, int64* cached_versions, + int64* cached_freqs) override { + mutex_lock l(mu_); + LRUNode* it = head->next; + size_t i; + for (i = 0; i < k_size && it != tail; i++, it = it->next) { + cached_ids[i] = it->id; + } + return i; + } + + void update(const K* batch_ids, size_t batch_size, bool use_locking = true) { + mutex temp_mu; + auto lock = BatchCache::maybe_lock_cache(mu_, temp_mu, use_locking); + for (size_t i = 0; i < batch_size; ++i) { + K id = batch_ids[i]; + typename std::map::iterator it = mp.find(id); + if (it != mp.end()) { + LRUNode* node = it->second; + node->pre->next = node->next; + node->next->pre = node->pre; + head->next->pre = node; + node->next = head->next; + head->next = node; + node->pre = head; + BatchCache::num_hit++; + } else { + LRUNode* newNode = new LRUNode(id); + head->next->pre = newNode; + newNode->next = 
head->next; + head->next = newNode; + newNode->pre = head; + mp[id] = newNode; + BatchCache::num_miss++; + } + } + } + + void update(const K* batch_ids, size_t batch_size, const int64* batch_version, + const int64* batch_freqs, bool use_locking = true) override { + // TODO: add to rank accroding to the version of ids + update(batch_ids, batch_size); + } + + void add_to_prefetch_list(const K* batch_ids, const size_t batch_size) { + mutex_lock l(mu_); + for (size_t i = 0; i < batch_size; ++i) { + K id = batch_ids[i]; + auto it_prefetch = prefetch_id_table.find(id); + if (it_prefetch == prefetch_id_table.end()) { + auto it_cache = mp.find(id); + if (it_cache != mp.end()) { + LRUNode* node = it_cache->second; + node->pre->next = node->next; + node->next->pre = node->pre; + delete node; + mp.erase(id); + } + prefetch_id_table[id] = new PrefetchNode(id); + } else { + it_prefetch->second->Ref(); + } + } + } + + void add_to_cache(const K* batch_ids, const size_t batch_size) { + mutex_lock l(mu_); + std::vector ids_to_cache(batch_size); + int64 nums_to_cache = 0; + for (size_t i = 0; i < batch_size; ++i) { + K id = batch_ids[i]; + auto it_prefetch = prefetch_id_table.find(id); + if (it_prefetch == prefetch_id_table.end()) { + LOG(FATAL) << "The id should be prefetched before being used."; + } + it_prefetch->second->UnRef(); + if (it_prefetch->second->ref_count() == 0) { + delete it_prefetch->second; + prefetch_id_table.erase(id); + ids_to_cache[nums_to_cache] = id; + nums_to_cache++; + } + } + update(ids_to_cache.data(), nums_to_cache, false); + } + + private: + class LRUNode { + public: + K id; + LRUNode *pre, *next; + LRUNode(K id) : id(id), pre(nullptr), next(nullptr) {} + }; + LRUNode *head, *tail; + std::map mp; + std::unordered_map*> prefetch_id_table; + mutex mu_; +}; + +template +class LFUCache : public BatchCache { + public: + LFUCache() { + min_freq = std::numeric_limits::max(); + max_freq = 0; + freq_table.emplace_back( + std::pair*, int64>(new std::list, 0)); + BatchCache::num_hit = 0; + BatchCache::num_miss = 0; + } + + size_t size() { + mutex_lock l(mu_); + return key_table.size(); + } + + size_t get_cached_ids(K* cached_ids, size_t k_size, int64* cached_versions, + int64* cached_freqs) override { + mutex_lock l(mu_); + size_t i = 0; + size_t curr_freq = max_freq; + auto it = freq_table[max_freq - 1].first->begin(); + while (i < k_size && curr_freq >= min_freq) { + cached_ids[i] = (*it).key; + cached_freqs[i] = (*it).freq; + i++; + it++; + if (it == freq_table[curr_freq - 1].first->end()) { + do { + curr_freq--; + } while (freq_table[curr_freq - 1].second == 0 && + curr_freq >= min_freq); + if (curr_freq >= min_freq) { + it = freq_table[curr_freq - 1].first->begin(); + } + } + } + return i; + } + + size_t get_evic_ids(K* evic_ids, size_t k_size) { + mutex_lock l(mu_); + size_t true_size = 0; + size_t st_freq = min_freq; + for (size_t i = 0; i < k_size && key_table.size() > 0; ++i) { + auto rm_it = freq_table[st_freq - 1].first->back(); + key_table.erase(rm_it.key); + evic_ids[i] = rm_it.key; + ++true_size; + freq_table[st_freq - 1].first->pop_back(); + freq_table[st_freq - 1].second--; + if (freq_table[st_freq - 1].second == 0) { + ++st_freq; + while (st_freq <= max_freq) { + if (freq_table[st_freq - 1].second == 0) { + ++st_freq; + } else { + min_freq = st_freq; + break; + } + } + if (st_freq > max_freq) { + reset_min_and_max_freq(); + } + } + } + return true_size; + } + + void update(const K* batch_ids, size_t batch_size, bool use_locking = true) { + mutex temp_mu; + auto lock = 
BatchCache::maybe_lock_cache(mu_, temp_mu, use_locking); + for (size_t i = 0; i < batch_size; ++i) { + K id = batch_ids[i]; + auto it = key_table.find(id); + if (it == key_table.end()) { + freq_table[0].first->emplace_front(LFUNode(id, 1)); + freq_table[0].second++; + key_table[id] = freq_table[0].first->begin(); + min_freq = 1; + max_freq = std::max(max_freq, min_freq); + BatchCache::num_miss++; + } else { + typename std::list::iterator node = it->second; + size_t freq = node->freq; + freq_table[freq - 1].first->erase(node); + freq_table[freq - 1].second--; + if (freq_table[freq - 1].second == 0) { + if (min_freq == freq) min_freq += 1; + } + if (freq == freq_table.size()) { + freq_table.emplace_back( + std::pair*, int64>(new std::list, 0)); + } + max_freq = std::max(max_freq, freq + 1); + freq_table[freq].first->emplace_front(LFUNode(id, freq + 1)); + freq_table[freq].second++; + key_table[id] = freq_table[freq].first->begin(); + BatchCache::num_hit++; + } + } + } + + void update(const K* batch_ids, const size_t batch_size, + const int64* batch_versions, const int64* batch_freqs, + bool use_locking = true) override { + mutex temp_mu; + auto lock = BatchCache::maybe_lock_cache(mu_, temp_mu, use_locking); + for (size_t i = 0; i < batch_size; ++i) { + K id = batch_ids[i]; + auto it = key_table.find(id); + size_t freq = batch_freqs[i]; + if (it == key_table.end()) { + if (freq < min_freq) { + min_freq = freq; + } + + if (freq > max_freq) { + max_freq = freq; + int64 prev_size = freq_table.size(); + if (max_freq > prev_size) { + freq_table.resize( + max_freq, std::pair*, int64>(nullptr, 0)); + for (int64 j = prev_size; j < max_freq; j++) { + freq_table[j].first = new std::list; + } + } + } + freq_table[freq - 1].first->emplace_front(LFUNode(id, freq)); + freq_table[freq - 1].second++; + key_table[id] = freq_table[freq - 1].first->begin(); + BatchCache::num_miss++; + } else { + typename std::list::iterator node = it->second; + size_t last_freq = node->freq; + size_t curr_freq = last_freq + freq; + freq_table[last_freq - 1].first->erase(node); + freq_table[last_freq - 1].second--; + + if (curr_freq > max_freq) { + max_freq = curr_freq; + freq_table.resize(max_freq, std::pair*, int64>( + new std::list, 0)); + } + + if (freq_table[last_freq - 1].second == 0) { + if (min_freq == last_freq) { + update_min_freq(); + } + } + + freq_table[curr_freq - 1].first->emplace_front(LFUNode(id, curr_freq)); + freq_table[curr_freq - 1].second++; + key_table[id] = freq_table[curr_freq - 1].first->begin(); + BatchCache::num_hit++; + } + } + } + + void add_to_prefetch_list(const K* batch_ids, const size_t batch_size) { + mutex_lock l(mu_); + for (size_t i = 0; i < batch_size; ++i) { + K id = batch_ids[i]; + auto it_prefetch = prefetch_id_table.find(id); + if (it_prefetch == prefetch_id_table.end()) { + auto it_cache = key_table.find(id); + if (it_cache != key_table.end()) { + auto cache_node = it_cache->second; + int64 freq = cache_node->freq; + freq_table[freq - 1].first->erase(cache_node); + freq_table[freq - 1].second--; + key_table.erase(id); + if (freq_table[freq - 1].second == 0) { + if (freq == max_freq) { + update_max_freq(); + } + if (freq == min_freq) { + update_min_freq(); + } + } + prefetch_id_table[id] = new PrefetchLFUNode(id, freq); + } else { + prefetch_id_table[id] = new PrefetchLFUNode(id); + } + } else { + it_prefetch->second->Ref(); + } + } + } + + void add_to_cache(const K* batch_ids, const size_t batch_size) { + mutex_lock l(mu_); + std::vector ids_to_cache(batch_size); + std::vector 
freqs_to_cache(batch_size); + int64 nums_to_cache = 0; + for (size_t i = 0; i < batch_size; ++i) { + K id = batch_ids[i]; + auto it_prefetch = prefetch_id_table.find(id); + if (it_prefetch == prefetch_id_table.end()) { + LOG(FATAL) << "The id should be prefetched before being used."; + } + it_prefetch->second->UnRef(); + if (it_prefetch->second->ref_count() == 0) { + int64 freq = it_prefetch->second->freq(); + delete it_prefetch->second; + prefetch_id_table.erase(id); + ids_to_cache[nums_to_cache] = id; + freqs_to_cache[nums_to_cache] = freq; + nums_to_cache++; + } + } + const int64* versions_to_cache = nullptr; + update(ids_to_cache.data(), nums_to_cache, versions_to_cache, + freqs_to_cache.data(), false); + } + + private: + void reset_min_and_max_freq() { + min_freq = std::numeric_limits::max(); + max_freq = 0; + } + + void update_min_freq() { + size_t i; + for (i = min_freq + 1; i <= max_freq; i++) { + if (freq_table[i - 1].second != 0) { + min_freq = i; + break; + } + } + if (i > max_freq) { + reset_min_and_max_freq(); + } + } + + void update_max_freq() { + size_t i; + for (i = max_freq - 1; i >= min_freq; i--) { + if (freq_table[i - 1].second != 0) { + max_freq = i; + break; + } + } + if (i < min_freq) { + reset_min_and_max_freq(); + } + } + + class LFUNode { + public: + K key; + size_t freq; + LFUNode(K key, size_t freq) : key(key), freq(freq) {} + }; + size_t min_freq; + size_t max_freq; + std::vector*, int64>> freq_table; + std::unordered_map::iterator> key_table; + std::unordered_map*> prefetch_id_table; + mutex mu_; +}; + +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_CACHE_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/cache_factory.h b/deepray/custom_ops/embedding_variable/cc/embedding/cache_factory.h new file mode 100644 index 00000000..97e4cf2c --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/cache_factory.h @@ -0,0 +1,47 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/
+#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_CACHE_FACTORY_H_
+#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_CACHE_FACTORY_H_
+
+#include "cache.h"
+#include "deepray/custom_ops/embedding_variable/config.pb.h"
+
+namespace tensorflow {
+namespace embedding {
+class CacheFactory {
+ public:
+  template <typename K>
+  static BatchCache<K>* Create(CacheStrategy cache_strategy, std::string name) {
+    switch (cache_strategy) {
+      case CacheStrategy::LRU:
+        LOG(INFO) << " Use Storage::LRU in multi-tier EmbeddingVariable "
+                  << name;
+        return new LRUCache<K>();
+      case CacheStrategy::LFU:
+        LOG(INFO) << " Use Storage::LFU in multi-tier EmbeddingVariable "
+                  << name;
+        return new LFUCache<K>();
+      default:
+        LOG(INFO) << " Invalid Cache strategy, \
+            use LFU in multi-tier EmbeddingVariable "
+                  << name;
+        return new LFUCache<K>();
+    }
+  }
+};
+}  // namespace embedding
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_CACHE_FACTORY_H_
diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/cache_thread_pool_creator.h b/deepray/custom_ops/embedding_variable/cc/embedding/cache_thread_pool_creator.h
new file mode 100644
index 00000000..3c43a41c
--- /dev/null
+++ b/deepray/custom_ops/embedding_variable/cc/embedding/cache_thread_pool_creator.h
@@ -0,0 +1,45 @@
+/* Copyright 2022 The DeepRec Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_CACHE_THREADPOOL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_CACHE_THREADPOOL_H_ + +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/util/env_var.h" + +namespace tensorflow { + +namespace embedding { +template +class MultiTierStorage; + +class CacheThreadPoolCreator { + public: + static thread::ThreadPool* Create() { + int64 num_threads = 1; + TF_CHECK_OK( + ReadInt64FromEnvVar("TF_MULTI_TIER_EV_CACHE_THREADS", 1, &num_threads)); + static thread::ThreadPool cache_thread_pool(Env::Default(), ThreadOptions(), + "MultiTier_Embedding_Cache", + num_threads, + /*low_latency_hint=*/false); + return &cache_thread_pool; + } +}; + +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_CACHE_THREADPOOL_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/config.proto b/deepray/custom_ops/embedding_variable/cc/embedding/config.proto new file mode 100644 index 00000000..424fc5e1 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/config.proto @@ -0,0 +1,58 @@ +syntax = "proto3"; + +package tensorflow.embedding; + +enum StorageType { + // none + DEFAULT = 0; + + // one level + DRAM = 1; + PMEM_MEMKIND = 2; + PMEM_LIBPMEM = 3; + SSDHASH = 4; + LEVELDB = 5; + HBM = 6; + + // two level + DRAM_PMEM = 11; + DRAM_SSDHASH = 12; + HBM_DRAM = 13; + DRAM_LEVELDB = 14; + + // three level + DRAM_PMEM_SSDHASH = 101; + HBM_DRAM_SSDHASH = 102; + +} + +enum CopyBackFlag { + NOT_COPYBACK = 0; + COPYBACK = 1; + COPYBACK_AND_DESTROY = 2; +} + +enum SlotType { + EMBEDDING_VARIABLE = 0; + VARIABLE = 1; +} + +enum CacheStrategy { + LRU = 0; + LFU = 1; +} + +enum EmbeddingVariableType { + IMMUTABLE = 0; + MUTABLE = 1; +} + +enum ValuePtrStatus { + OK = 0; + IS_DELETED = 1; + NOT_IN_DRAM = 2; +} + +enum IsSetInitialized { + NOT_SET_INITAILIZED = 0; +} diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/counter_filter_descriptor_impl.h b/deepray/custom_ops/embedding_variable/cc/embedding/counter_filter_descriptor_impl.h new file mode 100644 index 00000000..bb5682c5 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/counter_filter_descriptor_impl.h @@ -0,0 +1,252 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ +#include + +#include "deepray/custom_ops/embedding_variable/cc/lib/allocator.h" +#include "deepray/custom_ops/embedding_variable/config.pb.h" +#include "feature_descriptor_impl.h" + +namespace tensorflow { +namespace embedding { +template +class HbmMultiTierFeatureDescriptorImpl; + +template +class NormalFeatureDescriptorImpl; + +template +class CounterFilterDescriptorImpl : public FeatureDescriptorImpl { + public: + CounterFilterDescriptorImpl(Allocator* alloc, int64 slot_num, + bool need_record_freq, bool need_record_version, + int64 filter_freq, StorageType storage_type) + : filter_freq_(filter_freq), + is_record_freq_(need_record_freq), + FeatureDescriptorImpl(slot_num, need_record_freq, + need_record_version) { + if (filter_freq >= (1L << version_offset_bits_)) { + LOG(FATAL) << "Filter freqeuncy threshold shouldn't bigger than 2^12."; + } + + if (storage_type == StorageType::HBM_DRAM || + storage_type == StorageType::HBM_DRAM_SSDHASH) { +#if GOOGLE_CUDA + feat_desc_impl_.reset(new HbmMultiTierFeatureDescriptorImpl( + alloc, slot_num, need_record_freq, need_record_version)); +#endif // GOOGLE_CUDA + } else { + feat_desc_impl_.reset(new NormalFeatureDescriptorImpl( + alloc, slot_num, need_record_freq, need_record_version)); + } + } + + CounterFilterDescriptorImpl(CounterFilterDescriptorImpl* feat_desc_impl) + : filter_freq_(feat_desc_impl->filter_freq_), + FeatureDescriptorImpl(feat_desc_impl) { +#if GOOGLE_CUDA + if (typeid(*(feat_desc_impl->feat_desc_impl_.get())) == + typeid(HbmMultiTierFeatureDescriptorImpl*)) { + feat_desc_impl_.reset(new NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc_impl->feat_desc_impl_.get()))); + } else { +#endif // GOOGLE_CUDA + feat_desc_impl_.reset(new NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc_impl->feat_desc_impl_.get()))); +#if GOOGLE_CUDA + } +#endif // GOOGLE_CUDA + } + + ~CounterFilterDescriptorImpl() {} + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) override { + return feat_desc_impl_->InitSlotInfo(emb_index, embedding_dim, + default_value); + } + + bool InitSlotInfo(FeatureDescriptorImpl* feat_desc_impl) override { + return feat_desc_impl_->InitSlotInfo(feat_desc_impl); + } + + V* GetEmbedding(void* val, int emb_index) override { + return feat_desc_impl_->GetEmbedding(val, emb_index); + } + + bool IsAdmit(void* val) override { return (GetFlag(val) == 0); } + + void* Admit(void* val) override { + if (!IsAdmit(val)) { + return feat_desc_impl_->Allocate(); + } else { + LOG(FATAL) << "Only unadmited feature could be admited."; + return nullptr; + } + } + + void* Allocate() override { + uint64* val = (uint64*)alloc_->AllocateRaw(Allocator::kAllocatorAlignment, + alloc_bytes_); + uint64 flag = 1L << flag_offset_bits_; + uint64 version = (0xffffffffffffffff << version_offset_bits_); + uint64 freq = 0; + *val = version + freq; + val = (uint64*)((uint64)val | flag); + return (void*)val; + } + + void* Allocate(int64 freq) override { + if (freq < filter_freq_) { + return Allocate(); + } else { + return feat_desc_impl_->Allocate(); + } + } + + void Deallocate(void* val) override { + if (IsAdmit(val)) { + feat_desc_impl_->Deallocate(val); + } else { + void* tmp = GetPtr(val); + alloc_->DeallocateRaw(tmp); + } + } + + void Deallocate(const std::vector& vals) override { + 
for (auto val : vals) { + if (IsAdmit(val)) { + feat_desc_impl_->Deallocate(val); + } else { + void* tmp = GetPtr(val); + alloc_->DeallocateRaw(tmp); + } + } + } + + void AddFreq(void* val, int64 count) override { + uint64* tmp = (uint64*)GetPtr(val); + if (!IsAdmit(val)) { + __sync_fetch_and_add(tmp, count); + } else { + feat_desc_impl_->AddFreq(val, count); + } + } + + void SetAllocator(Allocator* alloc) override { + feat_desc_impl_->SetAllocator(alloc); + } + + void SetValue(void* val, int64 emb_index, V* value) { + if (IsAdmit(val)) { + feat_desc_impl_->SetValue(val, emb_index, value); + } + } + + void SetDefaultValue(void* val, int64 key) override { + feat_desc_impl_->SetDefaultValue(val, key); + } + +#if GOOGLE_CUDA + template + void SetDefaultValues(const K* keys, const std::list& init_cursor, + void** value_ptrs, se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + feat_desc_impl_->SetDefaultValues(keys, init_cursor, value_ptrs, + compute_stream, event_mgr, gpu_device); + } +#endif + + int64 GetFreq(void* val) override { + if (!IsAdmit(val)) { + void* tmp = GetPtr(val); + return *((uint64*)tmp) & ((1L << version_offset_bits_) - 1); + } else { + if (is_record_freq_) { + return feat_desc_impl_->GetFreq(val); + } else { + return filter_freq_; + } + } + } + + int64 GetVersion(void* val) override { + if (!IsAdmit(val)) { + void* tmp = GetPtr(val); + int64 version = *(uint64*)tmp >> version_offset_bits_; + if (version == 0xffffffffffff) { + version = -1; + } + return version; + } else { + return feat_desc_impl_->GetVersion(val); + } + } + + void UpdateVersion(void* val, int64 version) override { + if (!IsAdmit(val)) { + void* tmp_ptr = GetPtr(val); + uint64 tmp_val = 0; + uint64 result = 0; + do { + tmp_val = *(uint64*)tmp_ptr; + version = version << version_offset_bits_; + uint64 freq = tmp_val & ((1L << version_offset_bits_) - 1); + result = version + freq; + } while ( + !__sync_bool_compare_and_swap((uint64*)tmp_ptr, tmp_val, result)); + } else { + feat_desc_impl_->UpdateVersion(val, version); + } + } + + void SetFreq(void* val, int64 freq) override { + uint64* tmp_ptr = (uint64*)GetPtr(val); + if (!IsAdmit(val)) { + uint64 tmp = *tmp_ptr; + tmp = ~((1L << version_offset_bits_) - 1) & tmp; + tmp += freq; + __sync_bool_compare_and_swap(tmp_ptr, *tmp_ptr, tmp); + } else { + feat_desc_impl_->SetFreq(val, freq); + } + } + + int data_bytes() override { return alloc_bytes_; } + + private: + uint64 GetFlag(void* val) { return (uint64)val >> flag_offset_bits_; } + + void* GetPtr(void* val) { + return (void*)((uint64)val & ((1L << flag_offset_bits_) - 1)); + } + + int64 filter_freq_; + int alloc_bytes_ = 8; + Allocator* alloc_ = ev_allocator(); + const int freq_offset_bits_ = 0; + const int version_offset_bits_ = 16; + const int flag_offset_bits_ = 48; + std::unique_ptr> feat_desc_impl_; + bool is_record_freq_; +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/counter_filter_policy.h b/deepray/custom_ops/embedding_variable/cc/embedding/counter_filter_policy.h new file mode 100644 index 00000000..4098aa75 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/counter_filter_policy.h @@ -0,0 +1,189 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_POLICY_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_POLICY_H_ + +#include "embedding_config.h" +#include "filter_policy.h" + +namespace tensorflow { + +template +class CounterFilterPolicy : public FilterPolicy { + using FilterPolicy::ev_; + using FilterPolicy::config_; + + public: + CounterFilterPolicy(const EmbeddingConfig& config, EV* ev, + embedding::FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc), FilterPolicy(config, ev) {} + + Status Lookup(K key, V* val, const V* default_value_ptr, + const V* default_value_no_permission) override { + void* value_ptr = nullptr; + Status s = ev_->LookupKey(key, &value_ptr); + if (s.ok() && feat_desc_->IsAdmit(value_ptr)) { + V* mem_val = feat_desc_->GetEmbedding(value_ptr, config_.emb_index); + memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); + } else { + memcpy(val, default_value_no_permission, sizeof(V) * ev_->ValueLen()); + } + return OkStatus(); + } + +#if GOOGLE_CUDA + void BatchLookup(const EmbeddingVarContext& ctx, const K* keys, + V* output, int64 num_of_keys, V* default_value_ptr, + V* default_value_no_permission) override { + std::vector value_ptr_list(num_of_keys, nullptr); + ev_->BatchLookupKey(ctx, keys, value_ptr_list.data(), num_of_keys); + std::vector embedding_ptr(num_of_keys, nullptr); + auto do_work = [this, keys, value_ptr_list, &embedding_ptr, + default_value_ptr, + default_value_no_permission](int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + void* value_ptr = value_ptr_list[i]; + int64 freq = GetFreq(keys[i], value_ptr); + if (value_ptr != nullptr && feat_desc_->IsAdmit(value_ptr)) { + embedding_ptr[i] = + feat_desc_->GetEmbedding(value_ptr, config_.emb_index); + } else { + embedding_ptr[i] = default_value_no_permission; + } + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + 1000, do_work); + auto stream = ctx.compute_stream; + auto event_mgr = ctx.event_mgr; + ev_->CopyEmbeddingsToBuffer(output, num_of_keys, embedding_ptr.data(), + stream, event_mgr, ctx.gpu_device); + } + + void BatchLookupOrCreateKey(const EmbeddingVarContext& ctx, + const K* keys, void** value_ptrs_list, + int64 num_of_keys) override { + int num_worker_threads = ctx.worker_threads->num_threads; + std::vector> not_found_cursor_list(num_worker_threads + 1); + ev_->BatchLookupOrCreateKey(ctx, keys, value_ptrs_list, num_of_keys, + not_found_cursor_list); + } +#endif // GOOGLE_CUDA + + void LookupOrCreate(K key, V* val, const V* default_value_ptr, + void** value_ptr, int count, + const V* default_value_no_permission) override { + bool is_filter = true; + TF_CHECK_OK(LookupOrCreateKey(key, value_ptr, &is_filter, count)); + if (is_filter) { + V* mem_val = feat_desc_->GetEmbedding(*value_ptr, config_.emb_index); + memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); + } else { + memcpy(val, default_value_no_permission, sizeof(V) * ev_->ValueLen()); + } + } + + Status LookupOrCreateKey(K key, void** value_ptr, bool* 
is_filter, + int64 count) override { + *is_filter = false; + Status s = ev_->LookupKey(key, value_ptr); + if (!s.ok()) { + *value_ptr = feat_desc_->Allocate(); + if (count >= config_.filter_freq) { + void* admit_value_ptr = feat_desc_->Admit(*value_ptr); + feat_desc_->SetDefaultValue(admit_value_ptr, key); + feat_desc_->Deallocate(*value_ptr); + *value_ptr = admit_value_ptr; + *is_filter = true; + } + ev_->storage()->Insert(key, value_ptr); + s = OkStatus(); + } else if (!feat_desc_->IsAdmit(*value_ptr)) { + int64 freq = feat_desc_->GetFreq(*value_ptr); + if (freq + count >= config_.filter_freq) { + void* admit_value_ptr = feat_desc_->Admit(*value_ptr); + feat_desc_->SetFreq(admit_value_ptr, freq); + feat_desc_->UpdateVersion(admit_value_ptr, + feat_desc_->GetVersion(*value_ptr)); + feat_desc_->SetDefaultValue(admit_value_ptr, key); + ev_->storage()->UpdateValuePtr(key, admit_value_ptr, *value_ptr); + *value_ptr = admit_value_ptr; + *is_filter = true; + } + } else { + *is_filter = true; + } + feat_desc_->AddFreq(*value_ptr, count); + return s; + } + + int64 GetFreq(K key, void* value_ptr) override { + return feat_desc_->GetFreq(value_ptr); + } + + int64 GetFreq(K key) override { + void* value_ptr = nullptr; + TF_CHECK_OK(ev_->LookupOrCreateKey(key, &value_ptr)); + return feat_desc_->GetFreq(value_ptr); + } + + Status Restore(int64 key_num, int bucket_num, int64 partition_id, + int64 partition_num, int64 value_len, bool is_filter, + bool to_dram, bool is_incr, + RestoreBuffer& restore_buff) override { + K* key_buff = (K*)restore_buff.key_buffer; + V* value_buff = (V*)restore_buff.value_buffer; + int64* version_buff = (int64*)restore_buff.version_buffer; + int64* freq_buff = (int64*)restore_buff.freq_buffer; + for (auto i = 0; i < key_num; ++i) { + // this can describe by graph(Mod + DynamicPartition), + // but memory waste and slow + if (*(key_buff + i) % bucket_num % partition_num != partition_id) { + VLOG(1) << "skip EV key:" << *(key_buff + i); + continue; + } + int64 import_freq = 0; + int64 import_version = -1; + if (!is_filter) { + if (freq_buff[i] >= config_.filter_freq) { + import_freq = freq_buff[i]; + } else { + import_freq = config_.filter_freq; + } + } else { + import_freq = freq_buff[i]; + } + if (config_.steps_to_live != 0 || config_.record_version) { + import_version = version_buff[i]; + } + ev_->storage()->Import(key_buff[i], value_buff + i * ev_->ValueLen(), + import_freq, import_version, config_.emb_index); + } + return OkStatus(); + } + + bool is_admit(K key, void* value_ptr) override { + return feat_desc_->IsAdmit(value_ptr); + } + + private: + embedding::FeatureDescriptor* feat_desc_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_POLICY_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/cpu_hash_map_kv.h b/deepray/custom_ops/embedding_variable/cc/embedding/cpu_hash_map_kv.h new file mode 100644 index 00000000..3aaaf9d0 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/cpu_hash_map_kv.h @@ -0,0 +1,214 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
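The policy above is the admission side of the same mechanism: a key stays in its counter-only representation until the accumulated frequency reaches config_.filter_freq, at which point it is promoted to a real embedding slot, its frequency and version are carried over, and the stored pointer is swapped. A simplified sketch of that flow, using hypothetical value types rather than the actual FeatureDescriptor/EmbeddingVar interfaces:

#include <cstdint>
#include <unordered_map>
#include <vector>

// Each key starts as a bare counter; once its frequency reaches the filter
// threshold it is "admitted" and receives a real embedding vector.
struct Slot {
  int64_t freq = 0;
  bool admitted = false;
  std::vector<float> embedding;  // empty until admitted
};

Slot* LookupOrCreate(std::unordered_map<int64_t, Slot>& table, int64_t key,
                     int64_t count, int64_t filter_freq, int64_t value_len) {
  Slot& s = table[key];          // creates a counter-only slot if absent
  s.freq += count;
  if (!s.admitted && s.freq >= filter_freq) {
    s.admitted = true;
    s.embedding.assign(value_len, 0.0f);  // promotion: allocate the payload
  }
  return &s;
}

Read paths that find an unadmitted slot (Lookup/BatchLookup above) are served the shared default_value_no_permission instead of a private embedding, so filtered features never occupy full embedding storage.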
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +=======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_CPU_HASH_MAP_KV_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_CPU_HASH_MAP_KV_H_ + +#include "kv_interface.h" +#include "sparsehash/dense_hash_map_lockless" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace embedding { + +template +class LocklessHashMap : public KVInterface { + public: + LocklessHashMap(FeatureDescriptor* feat_desc) : feat_desc_(feat_desc) { + hash_map_.max_load_factor(0.8); + hash_map_.set_empty_key_and_value(LocklessHashMap::EMPTY_KEY_, + nullptr); + hash_map_.set_counternum(16); + hash_map_.set_deleted_key(LocklessHashMap::DELETED_KEY_); + pthread_key_create(&key_, NULL); + } + + ~LocklessHashMap() override { pthread_key_delete(key_); } + + Status Lookup(K key, void** value_ptr) override { + auto iter = hash_map_.find_wait_free(key); + if (iter.first == LocklessHashMap::EMPTY_KEY_) { + return errors::NotFound("Unable to find Key: ", key, + " in LocklessHashMap."); + } else { + *value_ptr = iter.second; + return OkStatus(); + } + } + + Status Contains(K key) override { + auto iter = hash_map_.find_wait_free(key); + if (iter.first == LocklessHashMap::EMPTY_KEY_) { + return errors::NotFound("Unable to find Key: ", key, + " in LocklessHashMap."); + } else { + return OkStatus(); + } + } + + Status Insert(K key, const void* value_ptr) override { + auto iter = hash_map_.insert_lockless( + std::move(std::pair(key, const_cast(value_ptr)))); + // insert fail, exist key + if ((*(iter.first)).second != value_ptr) { + return errors::AlreadyExists("already exists Key: ", key, + " in LocklessHashMap."); + } else { + return OkStatus(); + } + } + + // Other Method + int64 Size() const override { return hash_map_.size_lockless(); } + + // Remove KV + Status Remove(K key) override { + if (hash_map_.erase_lockless(key)) { + return OkStatus(); + } else { + return errors::NotFound("Unable to find Key: ", key, + " in LocklessHashMap."); + } + } + + Status Commit(K key, const void* value_ptr) override { + auto iter = hash_map_.insert_lockless( + std::move(std::pair(key, const_cast(value_ptr)))); + if ((*(iter.first)).second != value_ptr) { + AppendToValuePtrQueue((*(iter.first)).second); + __sync_bool_compare_and_swap(&((*(iter.first)).second), + (*(iter.first)).second, value_ptr); + } + return OkStatus(); + } + + Status BatchCommit(const std::vector& keys, + const std::vector& value_ptrs) override { + for (int i = 0; i < keys.size(); ++i) { + auto iter = hash_map_.insert_lockless(std::move( + std::pair(keys[i], const_cast(value_ptrs[i])))); + if ((*(iter.first)).second != value_ptrs[i]) { + AppendToValuePtrQueue((*(iter.first)).second); + __sync_bool_compare_and_swap(&((*(iter.first)).second), + (*(iter.first)).second, value_ptrs[i]); + } + } + return OkStatus(); + } + + Status GetSnapshot(std::vector* key_list, + std::vector* value_ptr_list) override { + std::pair* hash_map_dump; + int64 bucket_count; + auto it = hash_map_.GetSnapshot(); + hash_map_dump = it.first; + bucket_count = it.second; + for (int64 j = 0; j < bucket_count; j++) { + if 
(hash_map_dump[j].first != LocklessHashMap::EMPTY_KEY_ && + hash_map_dump[j].first != LocklessHashMap::DELETED_KEY_) { + key_list->emplace_back(hash_map_dump[j].first); + value_ptr_list->emplace_back(hash_map_dump[j].second); + } + } + free(hash_map_dump); + return OkStatus(); + } + + Status GetShardedSnapshot(std::vector>& key_list, + std::vector>& value_ptr_list, + int partition_id, int partition_nums) override { + std::pair* hash_map_dump; + int64 bucket_count; + auto it = hash_map_.GetSnapshot(); + hash_map_dump = it.first; + bucket_count = it.second; + for (int64 j = 0; j < bucket_count; j++) { + if (hash_map_dump[j].first != LocklessHashMap::EMPTY_KEY_ && + hash_map_dump[j].first != LocklessHashMap::DELETED_KEY_) { + int part_id = + hash_map_dump[j].first % kSavedPartitionNum % partition_nums; + if (part_id != partition_id) { + key_list[part_id].emplace_back(hash_map_dump[j].first); + value_ptr_list[part_id].emplace_back(hash_map_dump[j].second); + } + } + } + + free(hash_map_dump); + return OkStatus(); + } + + std::string DebugString() const override { + LOG(INFO) << "map info size:" << Size() + << "map info bucket_count:" << hash_map_.bucket_count() + << "map info load_factor:" << hash_map_.load_factor() + << "map info max_load_factor:" << hash_map_.max_load_factor() + << "map info min_load_factor:" << hash_map_.min_load_factor(); + return ""; + } + + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + auto iter = hash_map_.insert_lockless( + std::move(std::pair(key, old_value_ptr))); + bool flag = __sync_bool_compare_and_swap(&((*(iter.first)).second), + old_value_ptr, new_value_ptr); + if (flag) { + AppendToValuePtrQueue(old_value_ptr); + } else { + feat_desc_->Deallocate(new_value_ptr); + } + } + + private: + void AppendToValuePtrQueue(void* old_value_ptr) { + // A parameter that can be adjusted in the future + std::deque* value_ptr_queue = GetOutOfDateValuePtrQueue(); + if (value_ptr_queue->size() > CAP_INVALID_VALUEPTR) { + void* value_ptr = value_ptr_queue->front(); + feat_desc_->Deallocate(value_ptr); + value_ptr_queue->pop_front(); + } + value_ptr_queue->emplace_back(old_value_ptr); + } + + std::deque* GetOutOfDateValuePtrQueue() { + std::deque* value_ptr_queue = + static_cast*>(pthread_getspecific(key_)); + if (value_ptr_queue == nullptr) { + value_ptr_queue = new std::deque(); + pthread_setspecific(key_, value_ptr_queue); + } + return value_ptr_queue; + } + + private: + typedef google::dense_hash_map_lockless LockLessHashMap; + static const int EMPTY_KEY_; + static const int DELETED_KEY_; + LockLessHashMap hash_map_; + const int CAP_INVALID_VALUEPTR = 20000; + FeatureDescriptor* feat_desc_; + pthread_key_t key_; +}; +template +const int LocklessHashMap::EMPTY_KEY_ = -111; +template +const int LocklessHashMap::DELETED_KEY_ = -222; + +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_CPU_HASH_MAP_KV_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/dense_hash_map_kv.h b/deepray/custom_ops/embedding_variable/cc/embedding/dense_hash_map_kv.h new file mode 100644 index 00000000..8ae59141 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/dense_hash_map_kv.h @@ -0,0 +1,151 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
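LocklessHashMap never frees a replaced value pointer immediately; Commit/UpdateValuePtr park the old pointer on a per-thread queue, and the oldest entry is only released once the queue exceeds CAP_INVALID_VALUEPTR, giving in-flight readers time to finish. A compressed sketch of that delayed-reclamation idea, using thread_local and a plain free() in place of pthread keys and the feature descriptor:

#include <cstdlib>
#include <deque>

void Retire(void* old_ptr) {
  constexpr size_t kRetireCap = 20000;      // mirrors CAP_INVALID_VALUEPTR
  thread_local std::deque<void*> retired;   // one queue per thread
  if (retired.size() > kRetireCap) {
    std::free(retired.front());             // oldest entry is assumed quiescent
    retired.pop_front();
  }
  retired.push_back(old_ptr);
}

Note this is a heuristic rather than a proof of safety: it bounds memory growth and delays reuse, but unlike hazard pointers or epoch-based reclamation it does not guarantee that no reader still holds the freed pointer.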
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +=======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DENSE_HASH_MAP_KV_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DENSE_HASH_MAP_KV_H_ + +#include "deepray/custom_ops/utils/spin_rw_lock.h" +#include "kv_interface.h" +#include "sparsehash/dense_hash_map" +#include "tensorflow/core/framework/typed_allocator.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace embedding { + +template +class DenseHashMap : public KVInterface { + public: + DenseHashMap() : hash_map_(nullptr) { + hash_map_ = new dense_hash_map[partition_num_]; + for (int i = 0; i < partition_num_; i++) { + hash_map_[i].hash_map.max_load_factor(0.8); + hash_map_[i].hash_map.set_empty_key(-1); + hash_map_[i].hash_map.set_deleted_key(-2); + } + } + + ~DenseHashMap() override { delete[] hash_map_; } + + Status Lookup(K key, void** value_ptr) override { + int64 l_id = std::abs(key) % partition_num_; + spin_rd_lock l(hash_map_[l_id].mu); + auto iter = hash_map_[l_id].hash_map.find(key); + if (iter == hash_map_[l_id].hash_map.end()) { + return errors::NotFound("Unable to find Key: ", key, " in DenseHashMap."); + } else { + *value_ptr = iter->second; + return OkStatus(); + } + } + + Status Contains(K key) override { + int64 l_id = std::abs(key) % partition_num_; + spin_rd_lock l(hash_map_[l_id].mu); + auto iter = hash_map_[l_id].hash_map.find(key); + if (iter == hash_map_[l_id].hash_map.end()) { + return errors::NotFound("Unable to find Key: ", key, " in DenseHashMap."); + } else { + return OkStatus(); + } + } + + Status Insert(K key, const void* value_ptr) override { + int64 l_id = std::abs(key) % partition_num_; + spin_wr_lock l(hash_map_[l_id].mu); + auto iter = hash_map_[l_id].hash_map.find(key); + // insert fail, exist key + if (iter != hash_map_[l_id].hash_map.end()) { + return errors::AlreadyExists("already exists Key: ", key, + " in DenseHashMap."); + } else { + auto iter = hash_map_[l_id].hash_map.insert( + std::move(std::pair(key, const_cast(value_ptr)))); + return OkStatus(); + } + } + + // Other Method + int64 Size() const override { + int64 ret = 0; + for (int i = 0; i < partition_num_; i++) { + spin_rd_lock l(hash_map_[i].mu); + ret += hash_map_[i].hash_map.size(); + } + return ret; + } + + // Remove KV + Status Remove(K key) override { + int64 l_id = std::abs(key) % partition_num_; + spin_wr_lock l(hash_map_[l_id].mu); + if (hash_map_[l_id].hash_map.erase(key)) { + return OkStatus(); + } else { + return errors::NotFound("Unable to find Key: ", key, " in DenseHashMap."); + } + } + + Status GetSnapshot(std::vector* key_list, + std::vector* value_ptr_list) override { + dense_hash_map hash_map_dump[partition_num_]; + for (int i = 0; i < partition_num_; i++) { + spin_rd_lock l(hash_map_[i].mu); + hash_map_dump[i].hash_map = hash_map_[i].hash_map; + } + for (int i = 0; i < partition_num_; i++) { + for (const auto it : hash_map_dump[i].hash_map) { + key_list->push_back(it.first); + value_ptr_list->push_back(it.second); + } + } + return OkStatus(); + } + + Status GetShardedSnapshot(std::vector>& key_list, + std::vector>& 
value_ptr_list, + int partition_id, int partition_nums) override { + dense_hash_map hash_map_dump[partition_num_]; + for (int i = 0; i < partition_num_; i++) { + spin_rd_lock l(hash_map_[i].mu); + hash_map_dump[i].hash_map = hash_map_[i].hash_map; + } + for (int i = 0; i < partition_num_; i++) { + for (const auto it : hash_map_dump[i].hash_map) { + int part_id = it.first % kSavedPartitionNum % partition_nums; + if (part_id != partition_id) { + key_list[part_id].emplace_back(it.first); + value_ptr_list[part_id].emplace_back(it.second); + } + } + } + return OkStatus(); + } + + std::string DebugString() const override { return ""; } + + private: + const int partition_num_ = 1000; + struct dense_hash_map { + mutable easy_spinrwlock_t mu = EASY_SPINRWLOCK_INITIALIZER; + google::dense_hash_map hash_map; + }; + dense_hash_map* hash_map_; +}; + +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DENSE_HASH_MAP_KV_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/dram_leveldb_storage.h b/deepray/custom_ops/embedding_variable/cc/embedding/dram_leveldb_storage.h new file mode 100644 index 00000000..cd795954 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/dram_leveldb_storage.h @@ -0,0 +1,221 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
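DenseHashMap takes the opposite approach to the lockless map: keys are sharded across a fixed number of partitions, each holding its own google::dense_hash_map behind a reader-writer spin lock, so operations on different shards never contend. A minimal sketch of the sharding pattern with standard containers (std::shared_mutex and 16 shards stand in for the spin rwlock and partition_num_ = 1000):

#include <array>
#include <cstdint>
#include <cstdlib>
#include <shared_mutex>
#include <unordered_map>

class ShardedMap {
 public:
  bool Lookup(int64_t key, void** value) const {
    const Shard& s = shard(key);
    std::shared_lock<std::shared_mutex> l(s.mu);  // readers share the lock
    auto it = s.map.find(key);
    if (it == s.map.end()) return false;
    *value = it->second;
    return true;
  }

  bool Insert(int64_t key, void* value) {
    Shard& s = shard(key);
    std::unique_lock<std::shared_mutex> l(s.mu);  // writers are exclusive
    return s.map.emplace(key, value).second;      // false if the key exists
  }

 private:
  struct Shard {
    mutable std::shared_mutex mu;
    std::unordered_map<int64_t, void*> map;
  };
  static constexpr int kShards = 16;
  Shard& shard(int64_t key) { return shards_[std::llabs(key) % kShards]; }
  const Shard& shard(int64_t key) const {
    return shards_[std::llabs(key) % kShards];
  }
  std::array<Shard, kShards> shards_;
};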
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_LEVELDB_STORAGE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_LEVELDB_STORAGE_H_ + +#include "cpu_hash_map_kv.h" +#include "leveldb_kv.h" +#include "multi_tier_storage.h" +#include "single_tier_storage.h" + +namespace tensorflow { +template +class EmbeddingVar; + +namespace embedding { +template +class DramLevelDBStore : public MultiTierStorage { + public: + DramLevelDBStore(const StorageConfig& sc, FeatureDescriptor* feat_desc, + const std::string& name) + : dram_feat_desc_(feat_desc), MultiTierStorage(sc, name) { + dram_ = new DramStorage(sc, feat_desc); + leveldb_ = new LevelDBStore(sc, feat_desc); + } + + ~DramLevelDBStore() override { + MultiTierStorage::DeleteFromEvictionManager(); + delete dram_; + delete leveldb_; + } + + TF_DISALLOW_COPY_AND_ASSIGN(DramLevelDBStore); + + Status Get(K key, void** value_ptr) override { + Status s = dram_->Get(key, value_ptr); + if (s.ok()) { + return s; + } + s = leveldb_->Get(key, value_ptr); + if (s.ok()) { + s = dram_->TryInsert(key, *value_ptr); + if (s.ok()) { + return s; + } + leveldb_->DestroyValuePtr(*value_ptr); + return dram_->Get(key, value_ptr); + } + return s; + } + + void Insert(K key, void** value_ptr) override { + dram_->Insert(key, value_ptr); + } + + void CreateAndInsert(K key, void** value_ptr, bool to_dram = false) override { + dram_->CreateAndInsert(key, value_ptr); + } + + void Import(K key, V* value, int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); + } + + Status GetOrCreate(K key, void** value_ptr) override { + Status s = dram_->Get(key, value_ptr); + if (s.ok()) { + return s; + } + s = leveldb_->Get(key, value_ptr); + if (s.ok()) { + s = dram_->TryInsert(key, *value_ptr); + if (s.ok()) { + return s; + } + leveldb_->DestroyValuePtr(*value_ptr); + return dram_->Get(key, value_ptr); + } + dram_->CreateAndInsert(key, value_ptr); + return OkStatus(); + } + + Status Remove(K key) override { + dram_->Remove(key); + leveldb_->Remove(key); + return OkStatus(); + } + + bool IsUseHbm() override { return false; } + + bool IsSingleHbm() override { return false; } + + int64 Size() const override { + int64 total_size = dram_->Size(); + total_size += leveldb_->Size(); + return total_size; + } + + int64 Size(int level) const override { + if (level == 0) { + return dram_->Size(); + } else if (level == 1) { + return leveldb_->Size(); + } else { + return -1; + } + } + + int LookupTier(K key) const override { + Status s = dram_->Contains(key); + if (s.ok()) return 0; + s = leveldb_->Contains(key); + if (s.ok()) return 1; + return -1; + } + + Status Save(const string& tensor_name, const string& prefix, + BundleWriter* writer, const EmbeddingConfig& emb_config, + ShrinkArgs& shrink_args, int64 value_len, + V* default_value) override { + std::vector key_list, tmp_leveldb_key_list; + std::vector value_ptr_list, tmp_leveldb_value_list; + TF_CHECK_OK(dram_->GetSnapshot(&key_list, &value_ptr_list)); + + TF_CHECK_OK( + leveldb_->GetSnapshot(&tmp_leveldb_key_list, &tmp_leveldb_value_list)); + + for (int64 i = 0; i < tmp_leveldb_value_list.size(); i++) { + tmp_leveldb_value_list[i] = + (void*)((int64)tmp_leveldb_value_list[i] | (1L << kDramFlagOffset)); + } + + std::vector leveldb_key_list; + for (int64 i = 0; i < tmp_leveldb_key_list.size(); i++) { + Status s = dram_->Contains(tmp_leveldb_key_list[i]); + if (!s.ok()) { + 
key_list.emplace_back(tmp_leveldb_key_list[i]); + leveldb_key_list.emplace_back(tmp_leveldb_key_list[i]); + value_ptr_list.emplace_back(tmp_leveldb_value_list[i]); + } + } + + ValueIterator* value_iter = leveldb_->GetValueIterator( + leveldb_key_list, emb_config.emb_index, value_len); + + { + mutex_lock l(*(leveldb_->get_mutex())); + std::vector*> feat_desc_list(2); + FeatureDescriptor hbm_feat_desc(1, 1, ev_allocator() /*useless*/, + StorageType::HBM_DRAM, true, true, + {false, 0}); + feat_desc_list[0] = dram_feat_desc_; + feat_desc_list[1] = &hbm_feat_desc; + TF_CHECK_OK((Storage::SaveToCheckpoint( + tensor_name, writer, emb_config, value_len, default_value, key_list, + value_ptr_list, feat_desc_list, value_iter))); + } + + for (auto it : tmp_leveldb_value_list) { + cpu_allocator()->DeallocateRaw((void*)((int64)it & 0xffffffffffff)); + } + delete value_iter; + + return OkStatus(); + } + + Status Eviction(K* evict_ids, int64 evict_size) override { + void* value_ptr; + for (int64 i = 0; i < evict_size; ++i) { + if (dram_->Get(evict_ids[i], &value_ptr).ok()) { + TF_CHECK_OK(leveldb_->Commit(evict_ids[i], value_ptr)); + TF_CHECK_OK(dram_->Remove(evict_ids[i])); + dram_->DestroyValuePtr(value_ptr); + } + } + return OkStatus(); + } + + Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) override { + mutex_lock l(*(dram_->get_mutex())); + mutex_lock l1(*(leveldb_->get_mutex())); + MultiTierStorage::ReleaseInvalidValuePtr(dram_->feature_descriptor()); + void* value_ptr = nullptr; + for (int64 i = 0; i < evict_size; ++i) { + if (dram_->Get(evict_ids[i], &value_ptr).ok()) { + TF_CHECK_OK(leveldb_->Commit(evict_ids[i], value_ptr)); + TF_CHECK_OK(dram_->Remove(evict_ids[i])); + MultiTierStorage::KeepInvalidValuePtr(value_ptr); + } + } + return OkStatus(); + } + + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + dram_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + + protected: + int total_dim() override { return dram_feat_desc_->total_dim(); } + + private: + DramStorage* dram_; + LevelDBStore* leveldb_; + FeatureDescriptor* dram_feat_desc_ = nullptr; +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_LEVELDB_STORAGE_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/dram_pmem_storage.h b/deepray/custom_ops/embedding_variable/cc/embedding/dram_pmem_storage.h new file mode 100644 index 00000000..6f83ecb6 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/dram_pmem_storage.h @@ -0,0 +1,218 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
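DramLevelDBStore::Get/GetOrCreate follow a classic two-tier read path: hit DRAM, otherwise fetch from LevelDB and try to promote the value into DRAM, and if that promotion loses a race to another inserter, drop the fetched copy and re-read the winner's. The pattern, reduced to a generic sketch with an assumed Tier interface (not the actual storage classes):

#include <optional>

struct Tier {
  virtual std::optional<void*> Get(long long key) = 0;
  virtual bool TryInsert(long long key, void* value) = 0;  // false if key exists
  virtual void DestroyValue(void* value) = 0;
  virtual ~Tier() = default;
};

std::optional<void*> TieredGet(Tier& fast, Tier& slow, long long key) {
  if (auto v = fast.Get(key)) return v;     // fast-tier hit
  auto v = slow.Get(key);
  if (!v) return std::nullopt;              // miss in both tiers
  if (fast.TryInsert(key, *v)) return v;    // promoted into the fast tier
  slow.DestroyValue(*v);                    // another thread promoted first
  return fast.Get(key);                     // read the copy that won the race
}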
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_PMEM_STORAGE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_PMEM_STORAGE_H_ + +#include "cpu_hash_map_kv.h" +#include "feature_descriptor.h" +#include "multi_tier_storage.h" +#include "single_tier_storage.h" + +namespace tensorflow { +template +class EmbeddingVar; + +namespace embedding { + +template +class DramPmemStorage : public MultiTierStorage { + public: + DramPmemStorage(const StorageConfig& sc, FeatureDescriptor* feat_desc, + const std::string& name) + : dram_feat_desc_(feat_desc), MultiTierStorage(sc, name) { + dram_ = new DramStorage(sc, feat_desc); + pmem_feat_desc_ = new FeatureDescriptor(feat_desc); + pmem_feat_desc_->SetAllocator( + experimental_pmem_allocator(sc.path, sc.size[0])); + + pmem_ = new PmemLibpmemStorage(sc, pmem_feat_desc_); + } + + ~DramPmemStorage() override { + MultiTierStorage::DeleteFromEvictionManager(); + delete dram_; + delete pmem_; + delete pmem_feat_desc_; + } + + TF_DISALLOW_COPY_AND_ASSIGN(DramPmemStorage); + + Status Get(K key, void** value_ptr) override { + Status s = dram_->Get(key, value_ptr); + if (s.ok()) { + return s; + } + s = pmem_->Get(key, value_ptr); + void* new_value_ptr = dram_->CreateValuePtr(); + if (s.ok()) { + memcpy(new_value_ptr, value_ptr, pmem_feat_desc_->data_bytes()); + s = dram_->TryInsert(key, *value_ptr); + if (s.ok()) { + return s; + } + dram_->DestroyValuePtr(*value_ptr); + return dram_->Get(key, value_ptr); + } + return s; + } + + void Insert(K key, void** value_ptr) override { + dram_->Insert(key, value_ptr); + } + + void CreateAndInsert(K key, void** value_ptr, bool to_dram = false) override { + dram_->CreateAndInsert(key, value_ptr); + } + + void Import(K key, V* value, int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); + } + + bool IsUseHbm() override { return false; } + + bool IsSingleHbm() override { return false; } + + Status GetOrCreate(K key, void** value_ptr) override { + Status s = dram_->Get(key, value_ptr); + if (s.ok()) { + return s; + } + s = pmem_->Get(key, value_ptr); + + void* new_value_ptr = dram_->CreateValuePtr(); + if (s.ok()) { + memcpy(new_value_ptr, value_ptr, pmem_feat_desc_->data_bytes()); + } + *value_ptr = new_value_ptr; + + s = dram_->TryInsert(key, *value_ptr); + if (s.ok()) { + return s; + } + // Insert Failed, key already exist + dram_->DestroyValuePtr(*value_ptr); + return dram_->Get(key, value_ptr); + } + + Status Remove(K key) override { + dram_->Remove(key); + pmem_->Remove(key); + return OkStatus(); + } + + int64 Size() const override { + int64 total_size = dram_->Size(); + total_size += pmem_->Size(); + return total_size; + } + + int64 Size(int level) const override { + if (level == 0) { + return dram_->Size(); + } else if (level == 1) { + return pmem_->Size(); + } else { + return -1; + } + } + + int LookupTier(K key) const override { + Status s = dram_->Contains(key); + if (s.ok()) return 0; + s = pmem_->Contains(key); + if (s.ok()) return 1; + return -1; + } + + Status Save(const string& tensor_name, const string& prefix, + BundleWriter* writer, const EmbeddingConfig& emb_config, + ShrinkArgs& shrink_args, int64 value_len, + V* default_value) override { + std::vector key_list, tmp_pmem_key_list; + std::vector value_ptr_list, tmp_pmem_value_list; + + TF_CHECK_OK(dram_->GetSnapshot(&key_list, &value_ptr_list)); + dram_->Shrink(key_list, value_ptr_list, shrink_args, value_len); + + 
TF_CHECK_OK(pmem_->GetSnapshot(&tmp_pmem_key_list, &tmp_pmem_value_list)); + pmem_->Shrink(tmp_pmem_key_list, tmp_pmem_value_list, shrink_args, + value_len); + + for (int64 i = 0; i < tmp_pmem_key_list.size(); i++) { + Status s = dram_->Contains(tmp_pmem_key_list[i]); + if (!s.ok()) { + key_list.emplace_back(tmp_pmem_key_list[i]); + value_ptr_list.emplace_back(tmp_pmem_value_list[i]); + } + } + + TF_CHECK_OK((Storage::SaveToCheckpoint( + tensor_name, writer, emb_config, value_len, default_value, key_list, + value_ptr_list, pmem_feat_desc_))); + + return OkStatus(); + } + + Status Eviction(K* evict_ids, int64 evict_size) override { + void* value_ptr; + for (int64 i = 0; i < evict_size; ++i) { + if (dram_->Get(evict_ids[i], &value_ptr).ok()) { + TF_CHECK_OK(pmem_->Commit(evict_ids[i], value_ptr)); + TF_CHECK_OK(dram_->Remove(evict_ids[i])); + dram_->DestroyValuePtr(value_ptr); + } + } + return OkStatus(); + } + + Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) override { + mutex_lock l(*(dram_->get_mutex())); + mutex_lock l1(*(pmem_->get_mutex())); + MultiTierStorage::ReleaseInvalidValuePtr(dram_->feature_descriptor()); + void* value_ptr = nullptr; + for (int64 i = 0; i < evict_size; ++i) { + if (dram_->Get(evict_ids[i], &value_ptr).ok()) { + TF_CHECK_OK(pmem_->Commit(evict_ids[i], value_ptr)); + TF_CHECK_OK(dram_->Remove(evict_ids[i])); + MultiTierStorage::KeepInvalidValuePtr(value_ptr); + } + } + return OkStatus(); + } + + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + dram_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + + void Init() override { + pmem_feat_desc_->InitSlotInfo(dram_feat_desc_); + MultiTierStorage::Init(); + } + + protected: + int total_dim() override { return pmem_feat_desc_->total_dim(); } + + private: + DramStorage* dram_; + PmemLibpmemStorage* pmem_; + FeatureDescriptor* dram_feat_desc_ = nullptr; + FeatureDescriptor* pmem_feat_desc_ = nullptr; +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_PMEM_STORAGE_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/dram_ssd_storage.h b/deepray/custom_ops/embedding_variable/cc/embedding/dram_ssd_storage.h new file mode 100644 index 00000000..f8cdff26 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/dram_ssd_storage.h @@ -0,0 +1,214 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
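DramPmemStorage::Save, like the LevelDB variant above, builds its checkpoint from the union of the DRAM snapshot and only those slow-tier keys that are not currently resident in DRAM, so the hotter DRAM copy wins when a key exists in both tiers. A sketch of that merge with plain containers (the real code asks the DRAM tier per key via Contains; a set is used here only to keep the sketch self-contained):

#include <unordered_set>
#include <utility>
#include <vector>

using Snapshot = std::pair<std::vector<long long>, std::vector<void*>>;  // keys, value ptrs

Snapshot MergeForCheckpoint(const Snapshot& dram, const Snapshot& slow_tier) {
  Snapshot merged = dram;
  std::unordered_set<long long> in_dram(dram.first.begin(), dram.first.end());
  for (size_t i = 0; i < slow_tier.first.size(); ++i) {
    if (in_dram.count(slow_tier.first[i]) == 0) {  // keys only the slow tier holds
      merged.first.push_back(slow_tier.first[i]);
      merged.second.push_back(slow_tier.second[i]);
    }
  }
  return merged;  // each key is written to the checkpoint exactly once
}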
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_SSD_STORAGE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_SSD_STORAGE_H_ + +#include "cpu_hash_map_kv.h" +#include "multi_tier_storage.h" +#include "single_tier_storage.h" +#include "ssd_hash_kv.h" + +namespace tensorflow { +template +class EmbeddingVar; + +namespace embedding { +template +class DramSsdHashStorage : public MultiTierStorage { + public: + DramSsdHashStorage(const StorageConfig& sc, FeatureDescriptor* feat_desc, + const std::string& name) + : dram_feat_desc_(feat_desc), MultiTierStorage(sc, name) { + dram_ = new DramStorage(sc, feat_desc); + ssd_hash_ = new SsdHashStorage(sc, feat_desc); + } + + ~DramSsdHashStorage() override { + MultiTierStorage::DeleteFromEvictionManager(); + delete dram_; + delete ssd_hash_; + } + + TF_DISALLOW_COPY_AND_ASSIGN(DramSsdHashStorage); + + Status Get(K key, void** value_ptr) override { + Status s = dram_->Get(key, value_ptr); + if (s.ok()) { + return s; + } + s = ssd_hash_->Get(key, value_ptr); + if (s.ok()) { + s = dram_->TryInsert(key, *value_ptr); + if (s.ok()) { + return s; + } + // Insert Failed, the key is already in Dram; + ssd_hash_->DestroyValuePtr(*value_ptr); + return dram_->Get(key, value_ptr); + } + return s; + } + + void Insert(K key, void** value_ptr) override { + dram_->Insert(key, value_ptr); + } + + void CreateAndInsert(K key, void** value_ptr, bool to_dram = false) override { + dram_->CreateAndInsert(key, value_ptr); + } + + void Import(K key, V* value, int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); + } + + Status GetOrCreate(K key, void** value_ptr) override { + Status s = dram_->Get(key, value_ptr); + if (s.ok()) { + return s; + } + s = ssd_hash_->Get(key, value_ptr); + if (s.ok()) { + s = dram_->TryInsert(key, *value_ptr); + if (s.ok()) { + return s; + } + // Insert Failed, the key is already in Dram; + ssd_hash_->DestroyValuePtr(*value_ptr); + return dram_->Get(key, value_ptr); + } + dram_->CreateAndInsert(key, value_ptr); + return OkStatus(); + } + + Status Remove(K key) override { + dram_->Remove(key); + ssd_hash_->Remove(key); + return OkStatus(); + } + + int64 Size() const override { + int64 total_size = dram_->Size(); + total_size += ssd_hash_->Size(); + return total_size; + } + + int64 Size(int level) const override { + if (level == 0) { + return dram_->Size(); + } else if (level == 1) { + return ssd_hash_->Size(); + } else { + return -1; + } + } + + int LookupTier(K key) const override { + Status s = dram_->Contains(key); + if (s.ok()) return 0; + s = ssd_hash_->Contains(key); + if (s.ok()) return 1; + return -1; + } + + bool IsUseHbm() override { return false; } + + bool IsSingleHbm() override { return false; } + + bool IsUsePersistentStorage() override { return true; } + + Status Save(const string& tensor_name, const string& prefix, + BundleWriter* writer, const EmbeddingConfig& emb_config, + ShrinkArgs& shrink_args, int64 value_len, + V* default_value) override { + dram_->Save(tensor_name, prefix, writer, emb_config, shrink_args, value_len, + default_value); + + ssd_hash_->Save(tensor_name, prefix, writer, emb_config, shrink_args, + value_len, default_value); + + return OkStatus(); + } + + Status RestoreSSD(int64 emb_index, int64 emb_slot_num, int64 value_len, + const std::string& ssd_emb_file_name, + EmbeddingVar* ev, + RestoreSSDBuffer& restore_buff) override { + std::map file_id_map; + for (int64 i = 0; i < 
restore_buff.num_of_files; i++) { + file_id_map[restore_buff.file_list_buf[i]] = i; + } + + ssd_hash_->CopyEmbFilesFromCkpt( + restore_buff.file_list_buf, restore_buff.invalid_record_count_list_buf, + restore_buff.record_count_list_buf, restore_buff.num_of_files, + ssd_emb_file_name); + + ssd_hash_->Import(restore_buff.key_list_buf, + restore_buff.key_file_id_list_buf, + restore_buff.key_offset_list_buf, + restore_buff.num_of_keys, file_id_map); + return OkStatus(); + } + + Status Eviction(K* evict_ids, int64 evict_size) override { + void* value_ptr = nullptr; + for (int64 i = 0; i < evict_size; ++i) { + if (dram_->Get(evict_ids[i], &value_ptr).ok()) { + TF_CHECK_OK(ssd_hash_->Commit(evict_ids[i], value_ptr)); + TF_CHECK_OK(dram_->Remove(evict_ids[i])); + dram_->DestroyValuePtr(value_ptr); + } + } + return OkStatus(); + } + + Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) override { + mutex_lock l(*(dram_->get_mutex())); + mutex_lock l1(*(ssd_hash_->get_mutex())); + MultiTierStorage::ReleaseInvalidValuePtr(dram_->feature_descriptor()); + void* value_ptr = nullptr; + for (int64 i = 0; i < evict_size; ++i) { + if (dram_->Get(evict_ids[i], &value_ptr).ok()) { + TF_CHECK_OK(ssd_hash_->Commit(evict_ids[i], value_ptr)); + TF_CHECK_OK(dram_->Remove(evict_ids[i])); + MultiTierStorage::KeepInvalidValuePtr(value_ptr); + } + } + return OkStatus(); + } + + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + dram_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + + void Init() override { + ssd_hash_->Init(); + MultiTierStorage::Init(); + } + + protected: + int total_dim() override { return dram_feat_desc_->total_dim(); } + + private: + DramStorage* dram_ = nullptr; + SsdHashStorage* ssd_hash_ = nullptr; + FeatureDescriptor* dram_feat_desc_; +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DRAM_SSD_STORAGE_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/dynamic_dim_feature_descriptor_impl.h b/deepray/custom_ops/embedding_variable/cc/embedding/dynamic_dim_feature_descriptor_impl.h new file mode 100644 index 00000000..79e029a2 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/dynamic_dim_feature_descriptor_impl.h @@ -0,0 +1,195 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
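All three Dram* storages share the same eviction path: for every key picked by the cache policy, commit the DRAM value to the slower tier, remove it from the DRAM map, then reclaim the value either immediately or through the delayed-destroy queue. Reduced to a sketch with an assumed interface:

#include <optional>
#include <vector>

struct EvictableTier {
  virtual std::optional<void*> Get(long long key) = 0;
  virtual void Commit(long long key, void* value) = 0;  // write-back
  virtual void Remove(long long key) = 0;
  virtual void DestroyValue(void* value) = 0;
  virtual ~EvictableTier() = default;
};

void Evict(EvictableTier& fast, EvictableTier& slow,
           const std::vector<long long>& victim_keys) {
  for (long long key : victim_keys) {
    auto v = fast.Get(key);
    if (!v) continue;          // already evicted or removed
    slow.Commit(key, *v);      // persist before dropping the DRAM copy
    fast.Remove(key);
    fast.DestroyValue(*v);     // eager variant; the delayed variant parks *v
  }
}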
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DYNAMIC_DIM_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_DYNAMIC_DIM_DESCRIPTOR_H_ +#include +#include +#include + +#include "deepray/custom_ops/embedding_variable/cc/lib/allocator.h" +#include "feature_descriptor_impl.h" + +namespace tensorflow { +namespace embedding { +constexpr int COLUMN_BITSET_BYTES = 5; +constexpr int COLUMN_BITSET_SIZE = COLUMN_BITSET_BYTES * 8; + +struct MetaHeader { + volatile unsigned char embed_num; + unsigned char value_type; + unsigned char header_size; + unsigned char column_bitset[COLUMN_BITSET_BYTES]; + + static const int kEmbeddingNumStartIndex = 0; + static const int kValueTypeStartIndex = + kEmbeddingNumStartIndex + sizeof(char); + static const int kHeaderSizeStartIndex = kValueTypeStartIndex + sizeof(char); + static const int kColumnBitsetIndex = kHeaderSizeStartIndex + sizeof(char); + + inline unsigned int GetEmbeddingNum() { return (unsigned int)embed_num; } + + inline void SetEmbeddingNum(size_t s) { embed_num = (unsigned char)s; } + + inline std::bitset GetColumnBitset() { + unsigned long meta = ((unsigned long*)this)[0]; + std::bitset bs(meta >> (8 * kColumnBitsetIndex)); + return bs; + } + + inline void SetColumnBitset(const std::bitset& bs, + unsigned int embnum) { + ((unsigned long*)(this))[0] = (bs.to_ulong() << (8 * kColumnBitsetIndex)) | + (header_size << (8 * kHeaderSizeStartIndex)) | + (value_type << (8 * kValueTypeStartIndex)) | + (embnum << (8 * kEmbeddingNumStartIndex)); + } + + inline unsigned int GetHeaderSize() { return (unsigned int)header_size; } + + inline void SetHeaderSize(size_t size) { header_size = (unsigned char)size; } +}; + +template +class DynmaicDimDescriptorImpl : public FeatureDescriptorImpl { + using FeatureDescriptorImpl::slot_infos_; + + public: + DynmaicDimDescriptorImpl(Allocator* alloc, int64 slot_num) + : alloc_bytes_(sizeof(std::atomic_flag) + sizeof(MetaHeader) + + sizeof(V*) * slot_num), + header_offset_bytes_(sizeof(V*) * slot_num), + flag_offset_bytes_(sizeof(MetaHeader) + sizeof(V*) * slot_num), + FeatureDescriptorImpl(slot_num, false, false) { + FeatureDescriptorImpl::CreateFreqAndVersionDescriptor(&alloc_bytes_); + } + ~DynmaicDimDescriptorImpl() {} + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) override { + return FeatureDescriptorImpl::SetEmbeddingInfo(emb_index, embedding_dim, + default_value); + } + + V* GetEmbedding(void* val, int emb_index) override { + MetaHeader* meta = (MetaHeader*)(val + header_offset_bytes_); + unsigned int embnum = (unsigned int)meta->embed_num; + auto metadata = meta->GetColumnBitset(); + + if (!metadata.test(emb_index)) { + std::atomic_flag* flag = (std::atomic_flag*)(val + flag_offset_bytes_); + while (flag->test_and_set(std::memory_order_acquire)); + metadata = meta->GetColumnBitset(); + if (metadata.test(emb_index)) { + flag->clear(std::memory_order_release); + return ((V**)val)[emb_index]; + } + embnum++; + int64 alloc_value_len = slot_infos_[emb_index].embedding_dim; + V* tensor_val = (V*)alloc_->AllocateRaw(Allocator::kAllocatorAlignment, + sizeof(V) * alloc_value_len); + V* default_v = (V*)slot_infos_[emb_index].default_value; + memcpy(tensor_val, default_v, + sizeof(V) * slot_infos_[emb_index].default_value_len); + ((V**)val)[emb_index] = tensor_val; + + metadata.set(emb_index); + // NOTE:if we use ((unsigned long*)((char*)ptr_ + 1))[0] = + // metadata.to_ulong(); the ptr_ will be occaionally 
modified from + // 0x7f18700912a0 to 0x700912a0 must use ((V**)ptr_ + 1 + 1)[emb_index] = + // tensor_val; to avoid + // LOG(INFO)<<"emb_num: "<SetColumnBitset(metadata, embnum); + flag->clear(std::memory_order_release); + return tensor_val; + } else { + return ((V**)val)[emb_index]; + } + } + + bool IsAdmit(void* val) override { return true; } + + void* Admit(void* val) override {} + + void* Allocate() override { + void* val = + alloc_->AllocateRaw(Allocator::kAllocatorAlignment, alloc_bytes_); + memset(val, 0, alloc_bytes_); + new ((char*)val + header_offset_bytes_) MetaHeader(); + return val; + } + + void Deallocate(void* val) override { + MetaHeader* meta = (MetaHeader*)(val + header_offset_bytes_); + unsigned int embnum = (unsigned int)meta->GetEmbeddingNum(); + // LOG(INFO)<<"emb_num in deallocate: "<GetColumnBitset(); + for (int i = 0; i < embnum; i++) { + if (metadata.test(i)) { + V* val_ptr = ((V**)((int64*)val + meta->GetHeaderSize()))[i]; + if (val_ptr != nullptr) { + alloc_->DeallocateRaw(val_ptr); + } + } + } + } + + void Deallocate(const std::vector& vals) override { + for (auto val : vals) { + Deallocate(val); + } + } + + void AddFreq(void* val, int64 count) override {} + + void SetAllocator(Allocator* alloc) override { alloc_ = alloc; } + + void SetDefaultValue(void* val, int64 key) override {} + + void SetValue(void* val, int64 emb_index, V* value) override { + V* val_ptr = GetEmbedding(val, emb_index); + memcpy( + val_ptr, value, + sizeof(V) * + FeatureDescriptorImpl::slot_infos_[emb_index].default_value_len); + } + +#if GOOGLE_CUDA + template + void SetDefaultValues(const K* keys, const std::list& init_cursor, + void** value_ptrs, se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) {} +#endif + + int64 GetFreq(void* val) override {} + + int64 GetVersion(void* val) override {} + + void UpdateVersion(void* val, int64 version) override {} + + void SetFreq(void* val, int64 freq) override {} + + int data_bytes() override { return alloc_bytes_; } + + private: + int alloc_bytes_ = 0; + int header_offset_bytes_ = 0; + int flag_offset_bytes_ = 0; + Allocator* alloc_ = ev_allocator(); +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_COUNTER_FILTER_DESCRIPTOR_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/emb_file.h b/deepray/custom_ops/embedding_variable/cc/embedding/emb_file.h new file mode 100644 index 00000000..75506b4e --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/emb_file.h @@ -0,0 +1,244 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
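DynmaicDimDescriptorImpl materialises a slot's embedding only on first access: a bitset in the packed MetaHeader records which slots exist, and a per-record std::atomic_flag spin lock ensures exactly one thread performs the allocation, with the bitset re-checked under the lock. A standalone sketch of that double-checked, lazily allocated layout (plain structs instead of the packed header):

#include <array>
#include <atomic>
#include <bitset>
#include <vector>

constexpr int kMaxSlots = 40;  // mirrors COLUMN_BITSET_SIZE

struct Record {
  std::bitset<kMaxSlots> initialized;
  std::atomic_flag lock = ATOMIC_FLAG_INIT;
  std::array<std::vector<float>, kMaxSlots> slots;
};

float* GetOrCreateSlot(Record& rec, int slot, int dim, const float* default_value) {
  if (!rec.initialized.test(slot)) {                             // unlocked fast path
    while (rec.lock.test_and_set(std::memory_order_acquire)) {}  // spin
    if (!rec.initialized.test(slot)) {                           // re-check under lock
      rec.slots[slot].assign(default_value, default_value + dim);
      rec.initialized.set(slot);
    }
    rec.lock.clear(std::memory_order_release);
  }
  return rec.slots[slot].data();
}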
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMB_FILE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMB_FILE_H_ +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "tensorflow/core/util/env_var.h" + +namespace tensorflow { +namespace embedding { +class EmbFile { + public: + EmbFile(const std::string& path, size_t ver, int64 buffer_size) + : version_(ver), + file_size_(buffer_size), + count_(0), + invalid_count_(0), + is_deleted_(false) { + std::stringstream ss; + ss << std::setw(4) << std::setfill('0') << ver << ".emb"; + filepath_ = path + ss.str(); + OpenFstream(); + } + + virtual ~EmbFile() {} + virtual void Reopen() = 0; + virtual void Read(char* val, const size_t val_len, const size_t offset) = 0; + + virtual void DeleteFile() { + is_deleted_ = true; + if (fs_.is_open()) { + fs_.close(); + } + close(fd_); + std::remove(filepath_.c_str()); + } + + void LoadExistFile(const std::string& old_file_path, size_t count, + size_t invalid_count) { + Env::Default()->CopyFile(old_file_path, filepath_); + Reopen(); + count_ = count; + invalid_count_ = invalid_count; + } + + void Flush() { + if (fs_.is_open()) { + fs_.flush(); + } + } + + void MapForRead() { + file_addr_for_read_ = + (char*)mmap(nullptr, file_size_, PROT_READ, MAP_PRIVATE, fd_, 0); + } + + void UnmapForRead() { munmap((void*)file_addr_for_read_, file_size_); } + + void ReadWithMemcpy(char* val, const size_t val_len, const size_t offset) { + memcpy(val, file_addr_for_read_ + offset, val_len); + } + + void Write(const char* val, const size_t val_len) { + if (fs_.is_open()) { + fs_.write(val, val_len); + posix_fadvise(fd_, 0, file_size_, POSIX_FADV_DONTNEED); + } else { + fs_.open(filepath_, + std::ios::app | std::ios::in | std::ios::out | std::ios::binary); + fs_.write(val, val_len); + fs_.close(); + } + } + + size_t Count() const { return count_; } + + void AddCount(size_t n) { count_ += n; } + + size_t InvalidCount() const { return invalid_count_; } + + void AddInvalidCount(size_t n) { invalid_count_ += n; } + + void AddInvalidCountAtomic(size_t n) { + __sync_fetch_and_add(&invalid_count_, n); + } + + size_t Version() const { return version_; } + + bool IsDeleted() const { return is_deleted_; } + + bool IsNeedToBeCompacted() { + return (count_ >= invalid_count_) && (count_ / 3 < invalid_count_); + } + + protected: + void OpenFstream() { + fs_.open(filepath_, + std::ios::app | std::ios::in | std::ios::out | std::ios::binary); + CHECK(fs_.good()); + } + void CloseFstream() { + if (fs_.is_open()) { + fs_.close(); + } + } + + private: + size_t version_; + size_t count_; + size_t invalid_count_; + char* file_addr_for_read_; + std::fstream fs_; + + protected: + int64 file_size_; + int fd_; + bool is_deleted_; + std::string filepath_; +}; + +class MmapMadviseEmbFile : public EmbFile { + public: + MmapMadviseEmbFile(const std::string& path, size_t ver, int64 buffer_size) + : EmbFile(path, ver, buffer_size) { + EmbFile::fd_ = open(EmbFile::filepath_.data(), O_RDONLY); + file_addr_ = (char*)mmap(nullptr, EmbFile::file_size_, PROT_READ, + MAP_PRIVATE, fd_, 0); + } + + void Reopen() override { + CloseFstream(); + munmap((void*)file_addr_, EmbFile::file_size_); + close(EmbFile::fd_); + OpenFstream(); + EmbFile::fd_ = open(EmbFile::filepath_.data(), O_RDONLY); + file_addr_ = (char*)mmap(nullptr, EmbFile::file_size_, PROT_READ, + MAP_PRIVATE, fd_, 0); + } + + void DeleteFile() override { + is_deleted_ = true; + 
CloseFstream(); + munmap((void*)file_addr_, EmbFile::file_size_); + close(EmbFile::fd_); + std::remove(EmbFile::filepath_.c_str()); + } + + void Read(char* val, const size_t val_len, const size_t offset) override { + memcpy(val, file_addr_ + offset, val_len); + int err = madvise(file_addr_, EmbFile::file_size_, MADV_DONTNEED); + if (err < 0) { + LOG(FATAL) << "Failed to madvise the page, file_addr_: " + << (void*)file_addr_ << ", file_size: " << EmbFile::file_size_; + } + } + + private: + char* file_addr_; +}; + +class MmapEmbFile : public EmbFile { + public: + MmapEmbFile(const std::string& path, size_t ver, int64 buffer_size) + : EmbFile(path, ver, buffer_size) { + EmbFile::fd_ = open(EmbFile::filepath_.data(), O_RDONLY); + } + + void Reopen() override { + CloseFstream(); + close(EmbFile::fd_); + OpenFstream(); + EmbFile::fd_ = open(EmbFile::filepath_.data(), O_RDONLY); + } + + void Read(char* val, const size_t val_len, const size_t offset) override { + char* file_addr_tmp = (char*)mmap(nullptr, EmbFile::file_size_, PROT_READ, + MAP_PRIVATE, fd_, 0); + memcpy(val, file_addr_tmp + offset, val_len); + munmap((void*)file_addr_tmp, EmbFile::file_size_); + } +}; + +class DirectIoEmbFile : public EmbFile { + public: + DirectIoEmbFile(const std::string& path, size_t ver, int64 buffer_size) + : EmbFile(path, ver, buffer_size) { + EmbFile::fd_ = open(EmbFile::filepath_.data(), O_RDONLY | O_DIRECT); + } + + void Reopen() override { + EmbFile::CloseFstream(); + close(EmbFile::fd_); + OpenFstream(); + EmbFile::fd_ = open(EmbFile::filepath_.data(), O_RDONLY | O_DIRECT); + } + + void Read(char* val, const size_t val_len, const size_t offset) override { + size_t page_size = getpagesize(); + int pages_to_read = val_len / page_size; + if (val_len % page_size != 0) { + pages_to_read += 1; + } + if (offset + val_len >= page_size * pages_to_read) { + pages_to_read += 1; + } + int aligned_offset = offset - (offset % page_size); + char* read_buffer = (char*)memalign(page_size, page_size * pages_to_read); + + int status = pread(EmbFile::fd_, (void*)read_buffer, + page_size * pages_to_read, aligned_offset); + if (status < 0) { + LOG(FATAL) << "Failed to pread, read size: " << page_size * pages_to_read + << ", offset: " << aligned_offset; + } + memcpy(val, read_buffer + (offset % page_size), val_len); + free(read_buffer); + } +}; + +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMB_FILE_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/emb_file_creator.h b/deepray/custom_ops/embedding_variable/cc/embedding/emb_file_creator.h new file mode 100644 index 00000000..a439315d --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/emb_file_creator.h @@ -0,0 +1,97 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
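DirectIoEmbFile::Read has to respect O_DIRECT's alignment rules: the offset is rounded down to a page boundary, whole pages are read into a page-aligned bounce buffer, and the requested bytes are then copied out. The same idea as a standalone POSIX helper (hypothetical name, with the page rounding written out explicitly):

#include <cstdlib>
#include <cstring>
#include <malloc.h>
#include <sys/types.h>
#include <unistd.h>

// Reads `len` bytes at `offset` from `fd` (opened with O_DIRECT) into `dst`.
bool AlignedPread(int fd, char* dst, size_t len, off_t offset) {
  const size_t page = static_cast<size_t>(getpagesize());
  const off_t aligned_offset = offset - (offset % page);
  const size_t span = static_cast<size_t>(offset - aligned_offset) + len;
  const size_t pages = (span + page - 1) / page;         // round up to whole pages
  char* bounce = static_cast<char*>(memalign(page, pages * page));
  if (bounce == nullptr) return false;
  const ssize_t n = pread(fd, bounce, pages * page, aligned_offset);
  const bool ok = n >= static_cast<ssize_t>(span);       // got everything we need
  if (ok) std::memcpy(dst, bounce + (offset % page), len);
  std::free(bounce);
  return ok;
}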
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMB_FILE_CREATOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMB_FILE_CREATOR_H_ +#include +#include + +#include "emb_file.h" +#include "tensorflow/core/util/env_var.h" + +namespace tensorflow { +namespace embedding { + +enum class IoScheme { + MMAP_AND_MADVISE = 0, + MMAP = 1, + DIRECT_IO = 2, + INVALID = 3 +}; + +class EmbFileCreator { + public: + virtual EmbFile* Create(const std::string& path, const size_t version, + const size_t buffer_size) = 0; +}; + +class MmapAndMadviseEmbFileCreator : public EmbFileCreator { + public: + EmbFile* Create(const std::string& path, const size_t version, + const size_t buffer_size) override { + return new MmapMadviseEmbFile(path, version, buffer_size); + } +}; + +class MmapEmbFileCreator : public EmbFileCreator { + public: + EmbFile* Create(const std::string& path, const size_t version, + const size_t buffer_size) override { + return new MmapEmbFile(path, version, buffer_size); + } +}; + +class DirectIoEmbFileCreator : public EmbFileCreator { + public: + EmbFile* Create(const std::string& path, const size_t version, + const size_t buffer_size) override { + return new DirectIoEmbFile(path, version, buffer_size); + } +}; + +class EmbFileCreatorFactory { + public: + static EmbFileCreator* Create(const std::string& io_scheme) { + std::map scheme_map{ + {"mmap_and_madvise", IoScheme::MMAP_AND_MADVISE}, + {"mmap", IoScheme::MMAP}, + {"directio", IoScheme::DIRECT_IO}}; + + IoScheme scheme = IoScheme::INVALID; + if (scheme_map.find(io_scheme) != scheme_map.end()) { + scheme = scheme_map[io_scheme]; + } + + switch (scheme) { + case IoScheme::MMAP_AND_MADVISE: + static MmapAndMadviseEmbFileCreator mmap_madvise_file_creator; + return &mmap_madvise_file_creator; + case IoScheme::MMAP: + static MmapEmbFileCreator mmap_file_creator; + return &mmap_file_creator; + case IoScheme::DIRECT_IO: + static DirectIoEmbFileCreator directio_file_creator; + return &directio_file_creator; + default: + LOG(WARNING) << "Invalid IO scheme of SSDHASH," + << " use default mmap_and_advise scheme."; + static MmapAndMadviseEmbFileCreator default_file_creator; + return &default_file_creator; + } + } +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMB_FILE_CREATOR_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/embedding_config.h b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_config.h new file mode 100644 index 00000000..e328ef91 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_config.h @@ -0,0 +1,110 @@ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_CONFIG_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_CONFIG_H_ + +#include + +#include "deepray/custom_ops/embedding_variable/config.pb.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { +struct EmbeddingConfig { + int64 emb_index; + int64 primary_emb_index; + int64 block_num; + int64 slot_num; + std::string name; + int64 steps_to_live; + int64 filter_freq; + int64 max_freq; + float l2_weight_threshold; + int64 kHashFunc; + int64 num_counter; + DataType counter_type; + int64 default_value_dim; + float default_value_no_permission; + bool record_freq; + bool record_version; + bool is_inference; + + EmbeddingConfig(int64 emb_index = 0, int64 primary_emb_index = 
0, + int64 block_num = 1, int slot_num = 0, + const std::string& name = "", int64 steps_to_live = 0, + int64 filter_freq = 0, int64 max_freq = 999999, + float l2_weight_threshold = -1.0, int64 max_element_size = 0, + float false_positive_probability = -1.0, + DataType counter_type = DT_UINT64, + int64 default_value_dim = 4096, + float default_value_no_permission = .0, + bool record_freq = false, bool record_version = false, + bool is_inference = false) + : emb_index(emb_index), + primary_emb_index(primary_emb_index), + block_num(block_num), + slot_num(slot_num), + name(name), + steps_to_live(steps_to_live), + filter_freq(filter_freq), + max_freq(max_freq), + l2_weight_threshold(l2_weight_threshold), + counter_type(counter_type), + default_value_dim(default_value_dim), + default_value_no_permission(default_value_no_permission), + record_freq(record_freq), + record_version(record_version), + is_inference(is_inference) { + if (max_element_size != 0 && false_positive_probability != -1.0) { + kHashFunc = calc_num_hash_func(false_positive_probability); + num_counter = + calc_num_counter(max_element_size, false_positive_probability); + } else { + kHashFunc = 0; + num_counter = 0; + } + } + + int64 calc_num_counter(int64 max_element_size, + float false_positive_probability) { + float loghpp = fabs(log(false_positive_probability)); + float factor = log(2) * log(2); + int64 num_bucket = ceil(loghpp / factor * max_element_size); + if (num_bucket * sizeof(counter_type) > 10 * (1L << 30)) + LOG(WARNING) << "The Size of BloomFilter is more than 10GB!"; + return num_bucket; + } + + bool is_counter_filter() { + if (filter_freq != 0 && kHashFunc == 0 && num_counter == 0) { + return true; + } else { + return false; + } + } + + int64 calc_num_hash_func(float false_positive_probability) { + float loghpp = fabs(log(false_positive_probability) / log(2)); + return ceil(loghpp); + } + bool is_primary() const { return emb_index == primary_emb_index; } + + bool is_save_freq() const { return filter_freq != 0 || record_freq; } + + bool is_save_version() const { return steps_to_live != 0 || record_version; } + + int64 get_filter_freq() { return filter_freq; } + + std::string DebugString() const { + return strings::StrCat( + "opname: ", name, " emb_index: ", emb_index, + " primary_emb_index: ", primary_emb_index, " block_num: ", block_num, + " slot_num: ", slot_num, " steps_to_live: ", steps_to_live, + " filter_freq: ", filter_freq, " max_freq: ", max_freq, + " l2_weight_threshold: ", l2_weight_threshold, + " default_value_dim: ", default_value_dim); + } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_CONFIG_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/embedding_memory_pool.h b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_memory_pool.h new file mode 100644 index 00000000..030ea37d --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_memory_pool.h @@ -0,0 +1,89 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
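EmbeddingConfig sizes its optional Bloom-filter admission counters with the standard formulas m = ceil(n * |ln p| / (ln 2)^2) counters and k = ceil(|log2 p|) hash functions, where n is max_element_size and p is the target false-positive probability. A small standalone check of those formulas:

#include <cmath>
#include <cstdint>
#include <cstdio>

int64_t NumCounters(int64_t max_elements, double false_positive_prob) {
  return static_cast<int64_t>(
      std::ceil(std::fabs(std::log(false_positive_prob)) /
                (std::log(2.0) * std::log(2.0)) * max_elements));
}

int64_t NumHashFuncs(double false_positive_prob) {
  return static_cast<int64_t>(
      std::ceil(std::fabs(std::log(false_positive_prob) / std::log(2.0))));
}

int main() {
  // 1e6 elements at a 1% false-positive rate -> roughly 9.6M counters, 7 hashes.
  std::printf("m=%lld k=%lld\n",
              static_cast<long long>(NumCounters(1000000, 0.01)),
              static_cast<long long>(NumHashFuncs(0.01)));
  return 0;
}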
+See the License for the specific language governing permissions and +limitations under the License. +=======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_MEMORY_POOL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_MEMORY_POOL_H_ +#include + +#include "tensorflow/core/framework/allocator.h" + +namespace tensorflow { +namespace embedding { +template +class EmbeddingMemoryPool { + public: + explicit EmbeddingMemoryPool(Allocator* alloc, int64 value_len, + int64 block_size) + : alloc_(alloc), value_len_(value_len), block_size_(block_size) { + embs_per_block_ = block_size_ / (sizeof(V) * value_len_); + CreateBlock(); + } + + ~EmbeddingMemoryPool() { + for (auto it : block_list_) { + alloc_->DeallocateRaw(it); + } + } + + V* Allocate() { + if (free_ptr_queue_.size() == 0) { + CreateBlock(); + } + V* ptr = free_ptr_queue_.front(); + free_ptr_queue_.pop_front(); + return ptr; + } + + void Deallocate(std::vector value_ptrs) { + int64 prev_size = value_ptrs_queue_.size(); + for (auto it : value_ptrs) { + value_ptrs_queue_.emplace_back(it); + } + if (value_ptrs_queue_.size() > embs_per_block_) { + int64 n = value_ptrs_queue_.size() - embs_per_block_; + n = std::min(prev_size, n); + for (int64 i = 0; i < n; i++) { + void* val = value_ptrs_queue_.front(); + free_ptr_queue_.emplace_back((V*)val); + value_ptrs_queue_.pop_front(); + } + } + } + + void Deallocate(V* ptr) { free_ptr_queue_.emplace_back(ptr); } + + private: + void CreateBlock() { + V* dev_addr = + (V*)alloc_->AllocateRaw(Allocator::kAllocatorAlignment, + sizeof(V) * value_len_ * embs_per_block_); + block_list_.emplace_back(dev_addr); + for (int64 i = 0; i < embs_per_block_; i++) { + free_ptr_queue_.emplace_back(dev_addr + i * value_len_); + } + } + + int64 block_size_; + int64 value_len_; + int64 embs_per_block_; + Allocator* alloc_; + std::deque free_ptr_queue_; + std::deque value_ptrs_queue_; + std::vector block_list_; +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_MEMORY_POOL_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.cu.cc b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.cu.cc new file mode 100644 index 00000000..7d8f889c --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.cu.cc @@ -0,0 +1,77 @@ +/* Copyright 2019 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "embedding_var.h" + +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { +using se::DeviceMemoryBase; +using se::Stream; + +void SyncWithEventMgr(se::Stream* stream, EventMgr* event_mgr) { + volatile bool is_kernel_finish = false; + event_mgr->ThenExecute(stream, + [&is_kernel_finish]() { is_kernel_finish = true; }); + while (!is_kernel_finish) { + } +} + +template +void EmbeddingVar::CopyEmbeddingsToBuffer( + V* val_base, int64 size, V** memcpy_address, se::Stream* compute_stream, + EventMgr* event_mgr, const Eigen::GpuDevice& gpu_device) { + int block_dim = 128; + V** dev_value_address = (V**)GetBuffer(size); + DeviceMemoryBase gpu_dst_ptr(dev_value_address, size * sizeof(V*)); + compute_stream->ThenMemcpy(&gpu_dst_ptr, memcpy_address, size * sizeof(V*)); + + int limit = size; + int length = ValueLen(); + TF_CHECK_OK(GpuLaunchKernel(embedding::BatchCopy, + (limit + block_dim - 1) / block_dim * length, + block_dim, 0, gpu_device.stream(), + dev_value_address, val_base, length, limit)); + SyncWithEventMgr(compute_stream, event_mgr); +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void EmbeddingVar::CopyEmbeddingsToBuffer( \ + vtype*, int64, vtype**, se::Stream*, EventMgr*, \ + const Eigen::GpuDevice& gpu_device); +#define REGISTER_KERNELS_ALL(type) \ + REGISTER_KERNELS(int32, type); \ + REGISTER_KERNELS(int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU + +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h new file mode 100644 index 00000000..57495fa4 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h @@ -0,0 +1,706 @@ +/* Copyright 2019 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_H_ + +#include "cache.h" +#include "embedding_config.h" +#include "embedding_var_context.h" +#include "embedding_var_restore.h" +#include "filter_factory.h" +#include "gpu_hash_map_kv.h" +#include "storage.h" +#include "storage_factory.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/typed_allocator.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +#if GOOGLE_CUDA +void SyncWithEventMgr(se::Stream* stream, EventMgr* event_mgr); +#endif // GOOGLE_CUDA + +template +class GPUHashTable; + +template +class EmbeddingVar : public ResourceBase { + public: + EmbeddingVar(const string& name, embedding::Storage* storage, + EmbeddingConfig emb_cfg, Allocator* alloc, + embedding::FeatureDescriptor* feat_desc) + : name_(name), + storage_(storage), + default_value_(nullptr), + default_value_no_permission_(nullptr), + value_len_(0), + alloc_(alloc), + default_value_alloc_(alloc), + emb_config_(emb_cfg), + feat_desc_(feat_desc) {} + + Status Init(const Tensor& default_tensor, int64 default_value_dim) { + if (storage_ == nullptr) { + return errors::InvalidArgument( + "Invalid ht_type to construct EmbeddingVar"); + } + + storage_type_ = storage_->GetStorageType(); + filter_ = FilterFactory::CreateFilter>( + emb_config_, this, storage_, feat_desc_); + emb_config_.default_value_dim = default_value_dim; + value_len_ = default_tensor.NumElements() / emb_config_.default_value_dim; + + if (storage_->IsUseHbm()) { +#if GOOGLE_CUDA + default_value_ = TypedAllocator::Allocate( + alloc_, default_tensor.NumElements(), AllocationAttributes()); + auto default_tensor_flat = default_tensor.flat(); + dev_addr_buffer_ = nullptr; + dev_addr_buffer_size_ = 0; + cudaMemcpy(default_value_, &default_tensor_flat(0), + default_tensor.TotalBytes(), cudaMemcpyDeviceToDevice); +#endif // GOOGLE_CUDA + } else if (storage_->IsSingleHbm()) { +#if GOOGLE_CUDA + storage_->SetValueLen(value_len_); + default_value_ = TypedAllocator::Allocate( + alloc_, default_tensor.NumElements(), AllocationAttributes()); + auto default_tensor_flat = default_tensor.flat(); + cudaMemcpy(default_value_, &default_tensor_flat(0), + default_tensor.TotalBytes(), cudaMemcpyDeviceToDevice); +#endif // GOOGLE_CUDA + } else { + alloc_ = ev_allocator(); + default_value_ = TypedAllocator::Allocate(default_value_alloc_, + default_tensor.NumElements(), + AllocationAttributes()); + + auto default_tensor_flat = default_tensor.flat(); + memcpy(default_value_, &default_tensor_flat(0), + default_tensor.TotalBytes()); + + default_value_no_permission_ = TypedAllocator::Allocate( + default_value_alloc_, value_len_, AllocationAttributes()); + for (int i = 0; i < value_len_; ++i) { + default_value_no_permission_[i] = + static_cast(emb_config_.default_value_no_permission); + } + } + bool is_all_slots_initialized = feat_desc_->InitSlotInfo( + emb_config_.emb_index, value_len_, + std::pair(default_value_, 
emb_config_.default_value_dim)); + if (is_all_slots_initialized) { + storage_->Init(); + SetAllSlotInitialized(); + } + + return OkStatus(); + } + + void SetInitialized() { is_initialized_ = true; } + + void SetAllSlotInitialized() { is_all_slot_initialized_ = true; } + + bool IsInitialized() const { return is_initialized_; } + + bool IsAllSlotInitialized() const { return is_all_slot_initialized_; } + + Status LookupKey(K key, void** value_ptr) { + return storage_->Get(key, value_ptr); + } + + Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, + bool indices_as_pointer, int64 count = 1) { + if (indices_as_pointer) { + *value_ptr = (void*)key; + *is_filter = filter_->is_admit(key, *value_ptr); + return OkStatus(); + } else { + Status s = filter_->LookupOrCreateKey(key, value_ptr, is_filter, count); + return s; + } + } + + Status Insert(K key, V* value) { + void* value_ptr = nullptr; + CreateKey(key, &value_ptr, true); + feat_desc_->SetValue(value_ptr, emb_config_.emb_index, value); + return OkStatus(); + } + + Status LookupOrCreateKey(K key, void** value_ptr) { + Status s = storage_->GetOrCreate(key, value_ptr); + TF_CHECK_OK(s); + return s; + } + + void CreateKey(K key, void** value_ptr, bool to_dram) { + storage_->CreateAndInsert(key, value_ptr, to_dram); + } + + void UpdateVersion(void* value_ptr, int64 gs) { + feat_desc_->UpdateVersion(value_ptr, gs); + } + + void BatchCommit(const std::vector& keys, + const std::vector& value_ptrs) { + TF_CHECK_OK(storage_->BatchCommit(keys, value_ptrs)); + } + + void Eviction(K* evict_ids, int64 evict_size) { + TF_CHECK_OK(storage_->Eviction(evict_ids, evict_size)); + } + + int64 GetVersion(K key) { + void* value_ptr = nullptr; + TF_CHECK_OK(LookupOrCreateKey(key, &value_ptr)); + return feat_desc_->GetVersion(value_ptr); + } + + int64 GetFreq(K key) { return filter_->GetFreq(key); } + + Status Lookup(K key, V* val, V* default_v) { + const V* default_value_ptr = + (default_v == nullptr) ? 
default_value_ : default_v; + return filter_->Lookup(key, val, default_value_ptr, + default_value_no_permission_); + } + + void GetEmbeddings(const EmbeddingVarContext& context, + const K* keys, V* output, int64 num_of_keys) { + auto do_work = [this, keys, output](int64 start, int64 limit) { + for (int64 i = start; i < limit; ++i) { + V* default_v = + default_value_ + + (std::abs(keys[i]) % emb_config_.default_value_dim) * value_len_; + filter_->Lookup(keys[i], output + i * value_len_, default_v, + default_value_no_permission_); + } + }; + auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + value_len_ * sizeof(V), do_work); + } + + // Used for CPU Adaptive Embedding + void GetEmbeddings(const EmbeddingVarContext& context, + const K* keys, V* output, int64 num_of_keys, + V* default_value) { + auto do_work = [this, keys, output, default_value](int64 start, + int64 limit) { + for (int64 i = start; i < limit; ++i) { + V* default_v = default_value + i * value_len_; + void* value_ptr = nullptr; + filter_->LookupOrCreate(keys[i], output + i * value_len_, default_v, + &value_ptr, 1, default_value_no_permission_); + feat_desc_->AddFreq(value_ptr, 1); + } + }; + auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + value_len_ * sizeof(V), do_work); + } + + void GetOrCreateKey(const EmbeddingVarContext& context, + const Tensor& keys_tensor, void** value_ptrs, + int64 num_of_keys) { + const K* keys = (K*)keys_tensor.data(); + auto do_work = [this, keys, value_ptrs](int64 start, int64 limit) { + for (int64 i = start; i < limit; ++i) { + bool is_filter = false; + filter_->LookupOrCreateKey(keys[i], &value_ptrs[i], &is_filter, 1); + } + }; + auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + value_len_ * sizeof(V), do_work); + + storage_->AddToCachePrefetchList(keys_tensor); + } + + void GatherEmbeddings(const EmbeddingVarContext& context, + const Tensor& keys_tensor, void** value_ptrs, V* output, + int64 num_of_keys) { + const K* keys = (K*)keys_tensor.data(); + auto do_work = [this, keys, value_ptrs, output](int64 start, int64 limit) { + for (int64 i = start; i < limit; ++i) { + bool is_admit = filter_->is_admit(keys[i], value_ptrs[i]); + V* value = nullptr; + if (is_admit) { + value = + feat_desc_->GetEmbedding(value_ptrs[i], emb_config_.emb_index); + } else { + value = default_value_no_permission_; + } + memcpy(output + i * value_len_, value, sizeof(V) * value_len_); + } + }; + auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + value_len_ * sizeof(V), do_work); + + storage_->AddToCache(keys_tensor); + } + +#if GOOGLE_CUDA + void GetEmbeddings(const EmbeddingVarContext& context, + const K* keys, V* output, int64 num_of_keys) { + if (IsSingleHbm()) { + storage_->BatchLookup(context.gpu_device, keys, output, num_of_keys, + default_value_); + } else { + filter_->BatchLookup(context, keys, output, num_of_keys, default_value_, + default_value_no_permission_); + } + } + + void GetOrCreateKey(const EmbeddingVarContext& context, + const Tensor& keys_tensor, void** value_ptrs, + int64 num_of_keys, bool indices_as_pointer = false) { + const K* keys = (K*)keys_tensor.data(); + filter_->BatchLookupOrCreateKey(context, keys, value_ptrs, num_of_keys); + storage_->AddToCachePrefetchList(keys_tensor); + } + + void BatchLookupOrCreateKey( 
+ const EmbeddingVarContext& context, const K* keys, + void** value_ptrs, int64 num_of_keys, + std::vector>& not_found_cursor_list) { + storage_->BatchGetOrCreate(context, keys, value_ptrs, num_of_keys, + value_len_, not_found_cursor_list); + } + + void GatherEmbeddings(const EmbeddingVarContext& context, + const Tensor& keys_tensor, void** value_ptrs, V* output, + int64 num_of_keys) { + std::vector embedding_ptr(num_of_keys); + const K* keys = (K*)keys_tensor.data(); + auto do_work = [this, keys, value_ptrs, output, &embedding_ptr]( + int64 start, int64 limit) { + for (int64 i = start; i < limit; ++i) { + bool is_admit = filter_->is_admit(keys[i], value_ptrs[i]); + feat_desc_->AddFreq(value_ptrs[i], 1); + if (is_admit) { + embedding_ptr[i] = + feat_desc_->GetEmbedding(value_ptrs[i], emb_config_.emb_index); + } else { + embedding_ptr[i] = default_value_no_permission_; + } + } + }; + auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + value_len_ * sizeof(V), do_work); + + auto stream = context.compute_stream; + auto event_mgr = context.event_mgr; + CopyEmbeddingsToBuffer(output, num_of_keys, embedding_ptr.data(), stream, + event_mgr, context.gpu_device); + + storage_->AddToCache(keys_tensor); + } + + void BatchLookupKey(const EmbeddingVarContext& ctx, const K* keys, + void** value_ptr_list, int64 num_of_keys) { + storage_->BatchGet(ctx, keys, value_ptr_list, num_of_keys); + } + + Status LookupOrCreateKey(const EmbeddingVarContext& context, + const K* keys, void** value_ptrs, int64 num_of_keys, + int64* indices_counts, + bool indices_as_pointer = false) { + if (indices_as_pointer) { + auto lookup_key_and_set_version_fn = [keys, value_ptrs](int64 start, + int64 limit) { + for (int i = start; i < limit; i++) { + value_ptrs[i] = (void*)keys[i]; + } + }; + const int64 unit_cost = + 1000; // very unreliable estimate for cost per step. + auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + unit_cost, lookup_key_and_set_version_fn); + } else { + filter_->BatchLookupOrCreateKey(context, keys, value_ptrs, num_of_keys); + } + + if (indices_counts != nullptr) { + auto add_freq_fn = [this, value_ptrs, indices_counts](int64 start, + int64 limit) { + for (int i = start; i < limit; i++) { + feat_desc_->AddFreq(value_ptrs[i], indices_counts[i]); + } + }; + const int64 unit_cost = + 1000; // very unreliable estimate for cost per step. 
+ auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + unit_cost, add_freq_fn); + } + return OkStatus(); + } +#endif + +#if GOOGLE_CUDA + void CopyEmbeddingsToBuffer(V* val_base, int64 size, V** memcpy_address, + se::Stream* compute_stream, EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device); +#endif // GOOGLE_CUDA + + typename TTypes::Flat flat(void* value_ptr) { + V* val = feat_desc_->GetEmbedding(value_ptr, emb_config_.emb_index); + Eigen::array dims({value_len_}); + return typename TTypes::Flat(val, dims); + } + + V* GetValuePtr(void* ptr) { + return feat_desc_->GetEmbedding(ptr, emb_config_.emb_index); + } + + int64 ValueLen() const { return value_len_; } + + int64 Size() const { return storage_->Size(); } + + int64 CacheSize() const { return storage_->CacheSize(); } + + int64 MemoryUsage() const { + return storage_->Size() * (sizeof(K) + feat_desc_->data_bytes()); + } + + int64 MinFreq() { return emb_config_.filter_freq; } + + int64 StepsToLive() const { return emb_config_.steps_to_live; } + + bool IsMultiLevel() { return storage_->IsMultiLevel(); } + + bool IsUseHbm() { return storage_->IsUseHbm(); } + + bool IsSingleHbm() { return storage_->IsSingleHbm(); } + + bool IsUsePersistentStorage() { return storage_->IsUsePersistentStorage(); } + + void InitCache(embedding::CacheStrategy cache_strategy) { + storage_->InitCache(cache_strategy); + } + + std::string DebugString() const { return emb_config_.DebugString(); } + + void Restore(const std::string& name_string, + const std::string& file_name_string, int64 partition_id, + int64 partition_num, bool is_incr, BundleReader* reader, + bool reset_version = false, + const Eigen::GpuDevice* device = nullptr) { + return storage_->Restore(name_string, file_name_string, partition_id, + partition_num, value_len_, is_incr, reset_version, + emb_config_, device, reader, this, filter_); + } + + Status Save(const string& tensor_name, const string& prefix, + BundleWriter* writer, embedding::ShrinkArgs& shrink_args) { + return storage_->Save(tensor_name, prefix, writer, emb_config_, shrink_args, + value_len_, default_value_); + } + + void GetSnapshot(std::vector* key_list, std::vector* value_list, + std::vector* version_list, + std::vector* freq_list) { + std::vector value_ptr_list; + storage_->GetSnapshot(key_list, &value_ptr_list); + bool is_save_freq = emb_config_.is_save_freq(); + bool is_save_version = emb_config_.is_save_version(); + for (int64 i = 0; i < key_list->size(); i++) { + if (feat_desc_->IsAdmit(value_ptr_list[i])) { + V* val = + feat_desc_->GetEmbedding(value_ptr_list[i], emb_config_.emb_index); + value_list->emplace_back(val); + } else { + value_list->emplace_back(default_value_); + } + + if (is_save_version) { + int64 dump_version = feat_desc_->GetVersion(value_ptr_list[i]); + version_list->emplace_back(dump_version); + } + + if (is_save_freq) { + int64 dump_freq = feat_desc_->GetFreq(value_ptr_list[i]); + freq_list->emplace_back(dump_freq); + } + } + } + + Status GetShardedSnapshot(std::vector>& key_list, + std::vector>& value_ptr_list, + int partition_id, int partition_num) { + return storage_->GetShardedSnapshot(key_list, value_ptr_list, partition_id, + partition_num); + } + + void ExportAndRemove(K* key_list, V* value_list, int64* version_list, + int64* freq_list, std::vector& tot_keys_list, + std::vector& tot_value_ptr_list) { + bool save_unfiltered_features = true; + TF_CHECK_OK(ReadBoolFromEnvVar("TF_EV_SAVE_FILTERED_FEATURES", true, + 
&save_unfiltered_features)); + + bool is_save_freq = emb_config_.is_save_freq(); + bool is_save_version = emb_config_.is_save_version(); + + for (int64 i = 0; i < tot_keys_list.size(); ++i) { + auto& value_ptr = tot_value_ptr_list[i]; + if ((int64)value_ptr == embedding::ValuePtrStatus::IS_DELETED) continue; + + bool is_admit = feat_desc_->IsAdmit(value_ptr); + bool is_in_dram = ((int64)value_ptr >> kDramFlagOffset == 0); + + if (is_admit) { + key_list[i] = tot_keys_list[i]; + + if (!is_in_dram) { + auto tmp_value = value_list + i * value_len_; + tmp_value = (V*)embedding::ValuePtrStatus::NOT_IN_DRAM; + value_ptr = (void*)((int64)value_ptr & ((1L << kDramFlagOffset) - 1)); + } else if (feat_desc_->GetEmbedding(value_ptr, 0) == nullptr) { + memcpy(value_list + i * value_len_, default_value_, + sizeof(V) * value_len_); + } else { + V* val = feat_desc_->GetEmbedding(value_ptr, emb_config_.emb_index); + memcpy(value_list + i * value_len_, val, sizeof(V) * value_len_); + } + + if (is_save_version) { + int64 dump_version = feat_desc_->GetVersion(value_ptr); + version_list[i] = dump_version; + } + + if (is_save_freq) { + int64 dump_freq = feat_desc_->GetFreq(value_ptr); + freq_list[i] = dump_freq; + } + } else { + if (!save_unfiltered_features) continue; + // TODO(JUNQI) : currently not export filtered keys + } + + if (emb_config_.is_primary()) { + Status s; + s = storage_->Remove(tot_keys_list[i]); + if (!s.ok()) { + LOG(ERROR) << "Remove keys error: " << s.message(); + } + feat_desc_->Deallocate(value_ptr); + } + } + return; + } + + Status RestoreFromKeysAndValues(int64 key_num, int partition_id, + int partition_num, const K* key_list, + const V* value_list, + const int64* version_list, + const int64* freq_list, + const Eigen::GpuDevice* device = nullptr) { + RestoreBuffer restore_buff((char*)key_list, (char*)value_list, + (char*)version_list, (char*)freq_list); + return storage_->RestoreFeatures( + key_num, kSavedPartitionNum, partition_id, partition_num, value_len_, + false /* is_filter*/, false /* is_incr*/, emb_config_, device, filter_, + restore_buff); + } + + mutex* mu() { return &mu_; } + + embedding::Storage* storage() { return storage_; } + + embedding::FeatureDescriptor* feature_descriptor() { return feat_desc_; } + + Status Shrink(embedding::ShrinkArgs& shrink_args) { + if (emb_config_.is_primary()) { + shrink_args.value_len = value_len_; + return storage_->Shrink(shrink_args); + } else { + return OkStatus(); + } + } + + string Name() { return name_; } + + V* GetDefaultValuePtr() { return default_value_; } + + int64 GetDefaultValueDim() { return emb_config_.default_value_dim; } + + V* GetDefaultValue(int64 key) { + return default_value_ + (key % emb_config_.default_value_dim) * value_len_; + } + + embedding::BatchCache* Cache() { return storage_->Cache(); } + + int64 GetEmbeddingIndex() { return emb_config_.emb_index; } + + int64 GetEmbeddingSlotNum() { return emb_config_.slot_num; } + + Allocator* GetAllocator() { return alloc_; } + + V** GetBuffer(int64 size) { + if (dev_addr_buffer_size_ >= size) { + return dev_addr_buffer_; + } else { + if (dev_addr_buffer_size_ != 0) { + alloc_->DeallocateRaw(dev_addr_buffer_); + } + dev_addr_buffer_ = (V**)alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, size * sizeof(V*)); + dev_addr_buffer_size_ = size; + return dev_addr_buffer_; + } + } + + void UpdateCache(const Tensor& indices, const Tensor& indices_counts, + bool is_called_by_gather = false) { + if (!is_called_by_gather || + (is_called_by_gather && emb_config_.is_inference)) { + 
storage_->UpdateCache(indices, indices_counts); + } + } + + void UpdateCache(const Tensor& indices, bool is_called_by_gather = false) { + if (!is_called_by_gather || + (is_called_by_gather && emb_config_.is_inference)) { + storage_->UpdateCache(indices); + } + } + + void UpdateCache(const K* key_buff, int64 key_num, const int64* version_buff, + const int64* freq_buff) { + auto cache = Cache(); + if (cache) { + cache->update(key_buff, key_num, version_buff, freq_buff); + auto cache_size = CacheSize(); + if (cache->size() > cache_size) { + int64 evict_size = cache->size() - cache_size; + K* evict_ids = new K[evict_size]; + size_t true_size = cache->get_evic_ids(evict_ids, evict_size); + if (!IsUseHbm()) { + Eviction(evict_ids, true_size); + } + delete[] evict_ids; + } + } + } + + void LookupOrCreate(const K* key, V* val, V* default_v, int32 default_v_num, + size_t n, const Eigen::GpuDevice& device) { + storage_->BatchLookupOrCreate(key, val, default_v, default_v_num, n, + device); + } + + void LookupOrCreateKey(const K* key, int32* item_idxs, size_t n, + const Eigen::GpuDevice& device, + int64 update_version = -1) { + storage_->BatchLookupOrCreateKeys(key, item_idxs, n, device); + } + + void Lookup(const K* key, V* val, V* default_v, int32 default_v_num, size_t n, + const Eigen::GpuDevice& device) { + storage_->BatchLookup(key, val, default_v, default_v_num, n, device); + } + + int32 SlotNum() { + return (emb_config_.block_num * (1 + emb_config_.slot_num)); + } + + int32 EmbIdx() { return emb_config_.emb_index; } + + GPUHashTable* HashTable() { return storage_->HashTable(); } + FilterPolicy>* GetFilter() const { return filter_; } + + protected: + ~EmbeddingVar() override { + // When dynamic dimension embedding is used, + // there will be more than one primary slot + if (emb_config_.is_primary() && emb_config_.primary_emb_index == 0) { + delete storage_; + delete feat_desc_; + } + if (embedding::StorageType::HBM_DRAM == storage_type_) { + alloc_->DeallocateRaw(dev_addr_buffer_); + } + TypedAllocator::Deallocate(default_value_alloc_, default_value_, + value_len_ * emb_config_.default_value_dim); + if (default_value_no_permission_) { + TypedAllocator::Deallocate(default_value_alloc_, + default_value_no_permission_, value_len_); + } + if (filter_) { + delete filter_; + } + } + + private: + void LookupThroughFilter(const EmbeddingVarContext& context, + const Tensor& indices, V* output, + int64 num_of_keys) { + const K* keys = (K*)indices.data(); + auto do_work = [this, keys, output](int64 start, int64 limit) { + for (int64 i = start; i < limit; ++i) { + V* default_v = default_value_ + + (keys[i] % emb_config_.default_value_dim) * value_len_; + filter_->Lookup(keys[i], output + i * value_len_, default_v, + default_value_no_permission_); + } + }; + auto worker_threads = context.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + value_len_ * sizeof(V), do_work); + } + + std::string name_; + bool is_initialized_ = false; + bool is_all_slot_initialized_ = false; + + mutex mu_; + + V* default_value_; + V* default_value_no_permission_; + V** dev_addr_buffer_; + int64 dev_addr_buffer_size_; + int64 value_len_; + Allocator* alloc_; + Allocator* default_value_alloc_; + embedding::Storage* storage_; + embedding::StorageType storage_type_; + EmbeddingConfig emb_config_; + FilterPolicy>* filter_; + embedding::FeatureDescriptor* feat_desc_; + + TF_DISALLOW_COPY_AND_ASSIGN(EmbeddingVar); +}; + +} // namespace tensorflow + +#endif // 
TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_ckpt_data.cc b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_ckpt_data.cc new file mode 100644 index 00000000..7eabf919 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_ckpt_data.cc @@ -0,0 +1,229 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#include "embedding_var_ckpt_data.h" + +#include "deepray/custom_ops/embedding_variable/cc/kernels/save_restore_tensor_ev.h" +#include "embedding_var_dump_iterator.h" +#include "tensorflow/core/framework/register_types.h" + +namespace tensorflow { +namespace embedding { +template +void EmbeddingVarCkptData::Emplace( + K key, void* value_ptr, const EmbeddingConfig& emb_config, V* default_value, + FeatureDescriptor* feat_desc, bool is_save_freq, bool is_save_version, + bool save_unfiltered_features) { + if ((int64)value_ptr == ValuePtrStatus::IS_DELETED) return; + + bool is_in_dram = ((int64)value_ptr >> kDramFlagOffset == 0); + bool is_admit = feat_desc->IsAdmit(value_ptr); + + if (is_admit) { + key_vec_.emplace_back(key); + + if (!is_in_dram) { + value_ptr_vec_.emplace_back((V*)ValuePtrStatus::NOT_IN_DRAM); + value_ptr = (void*)((int64)value_ptr & ((1L << kDramFlagOffset) - 1)); + } else if (feat_desc->GetEmbedding(value_ptr, 0) == nullptr) { + value_ptr_vec_.emplace_back(default_value); + } else { + V* val = feat_desc->GetEmbedding(value_ptr, emb_config.emb_index); + value_ptr_vec_.emplace_back(val); + } + if (is_save_version) { + int64 dump_version = feat_desc->GetVersion(value_ptr); + version_vec_.emplace_back(dump_version); + } + + if (is_save_freq) { + int64 dump_freq = feat_desc->GetFreq(value_ptr); + freq_vec_.emplace_back(dump_freq); + } + } else { + if (!save_unfiltered_features) return; + + key_filter_vec_.emplace_back(key); + + if (is_save_version) { + int64 dump_version = feat_desc->GetVersion(value_ptr); + version_filter_vec_.emplace_back(dump_version); + } + + int64 dump_freq = feat_desc->GetFreq(value_ptr); + freq_filter_vec_.emplace_back(dump_freq); + } +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void EmbeddingVarCkptData::Emplace( \ + ktype, void*, const EmbeddingConfig&, vtype*, FeatureDescriptor*, \ + bool, bool, bool); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +void EmbeddingVarCkptData::Emplace(K key, V* value_ptr) { + key_vec_.emplace_back(key); + value_ptr_vec_.emplace_back(value_ptr); +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void EmbeddingVarCkptData::Emplace(ktype, vtype*); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) 
+TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +void EmbeddingVarCkptData::SetWithPartition( + std::vector>& ev_ckpt_data_parts) { + part_offset_.resize(kSavedPartitionNum + 1); + part_filter_offset_.resize(kSavedPartitionNum + 1); + part_offset_[0] = 0; + part_filter_offset_[0] = 0; + for (int i = 0; i < kSavedPartitionNum; i++) { + part_offset_[i + 1] = + part_offset_[i] + ev_ckpt_data_parts[i].key_vec_.size(); + + part_filter_offset_[i + 1] = + part_filter_offset_[i] + ev_ckpt_data_parts[i].key_filter_vec_.size(); + + for (int64 j = 0; j < ev_ckpt_data_parts[i].key_vec_.size(); j++) { + key_vec_.emplace_back(ev_ckpt_data_parts[i].key_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].value_ptr_vec_.size(); j++) { + value_ptr_vec_.emplace_back(ev_ckpt_data_parts[i].value_ptr_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].version_vec_.size(); j++) { + version_vec_.emplace_back(ev_ckpt_data_parts[i].version_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].freq_vec_.size(); j++) { + freq_vec_.emplace_back(ev_ckpt_data_parts[i].freq_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].key_filter_vec_.size(); j++) { + key_filter_vec_.emplace_back(ev_ckpt_data_parts[i].key_filter_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].version_filter_vec_.size(); + j++) { + version_filter_vec_.emplace_back( + ev_ckpt_data_parts[i].version_filter_vec_[j]); + } + + for (int64 j = 0; j < ev_ckpt_data_parts[i].freq_filter_vec_.size(); j++) { + freq_filter_vec_.emplace_back(ev_ckpt_data_parts[i].freq_filter_vec_[j]); + } + } +} + +#define REGISTER_KERNELS(ktype, vtype) \ + template void EmbeddingVarCkptData::SetWithPartition( \ + std::vector>&); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +Status EmbeddingVarCkptData::ExportToCkpt(const string& tensor_name, + BundleWriter* writer, + int64 value_len, + ValueIterator* value_iter) { + size_t bytes_limit = 8 << 20; + std::unique_ptr dump_buffer(new char[bytes_limit]); + + EVVectorDataDumpIterator key_dump_iter(key_vec_); + Status s = SaveTensorWithFixedBuffer( + tensor_name + "-keys", writer, dump_buffer.get(), bytes_limit, + &key_dump_iter, TensorShape({key_vec_.size()})); + if (!s.ok()) return s; + + EV2dVectorDataDumpIterator value_dump_iter(value_ptr_vec_, value_len, + value_iter); + s = SaveTensorWithFixedBuffer( + tensor_name + "-values", writer, dump_buffer.get(), bytes_limit, + &value_dump_iter, TensorShape({value_ptr_vec_.size(), value_len})); + if (!s.ok()) return s; + + EVVectorDataDumpIterator version_dump_iter(version_vec_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-versions", writer, dump_buffer.get(), bytes_limit, + &version_dump_iter, TensorShape({version_vec_.size()})); + if (!s.ok()) return s; + + EVVectorDataDumpIterator freq_dump_iter(freq_vec_); + s = SaveTensorWithFixedBuffer(tensor_name + "-freqs", writer, + dump_buffer.get(), bytes_limit, &freq_dump_iter, + TensorShape({freq_vec_.size()})); + if (!s.ok()) return s; + + EVVectorDataDumpIterator filtered_key_dump_iter(key_filter_vec_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-keys_filtered", writer, dump_buffer.get(), bytes_limit, + &filtered_key_dump_iter, TensorShape({key_filter_vec_.size()})); + if (!s.ok()) return s; + + EVVectorDataDumpIterator 
filtered_version_dump_iter( + version_filter_vec_); + s = SaveTensorWithFixedBuffer(tensor_name + "-versions_filtered", writer, + dump_buffer.get(), bytes_limit, + &filtered_version_dump_iter, + TensorShape({version_filter_vec_.size()})); + if (!s.ok()) return s; + + EVVectorDataDumpIterator filtered_freq_dump_iter(freq_filter_vec_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-freqs_filtered", writer, dump_buffer.get(), bytes_limit, + &filtered_freq_dump_iter, TensorShape({freq_filter_vec_.size()})); + if (!s.ok()) return s; + + EVVectorDataDumpIterator part_offset_dump_iter(part_offset_); + s = SaveTensorWithFixedBuffer( + tensor_name + "-partition_offset", writer, dump_buffer.get(), bytes_limit, + &part_offset_dump_iter, TensorShape({part_offset_.size()})); + if (!s.ok()) return s; + + EVVectorDataDumpIterator part_filter_offset_dump_iter( + part_filter_offset_); + s = SaveTensorWithFixedBuffer(tensor_name + "-partition_filter_offset", + writer, dump_buffer.get(), bytes_limit, + &part_filter_offset_dump_iter, + TensorShape({part_filter_offset_.size()})); + if (!s.ok()) return s; + + return OkStatus(); +} + +#define REGISTER_KERNELS(ktype, vtype) \ + template Status EmbeddingVarCkptData::ExportToCkpt( \ + const string&, BundleWriter*, int64, ValueIterator*); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS +} // namespace embedding +} // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_ckpt_data.h b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_ckpt_data.h new file mode 100644 index 00000000..0ea4f1e3 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_ckpt_data.h @@ -0,0 +1,57 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_CKPT_DATA_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_CKPT_DATA_ +#include "embedding_config.h" +#include "embedding_var_dump_iterator.h" +#include "tensorflow/core/platform/types.h" +namespace tensorflow { +class BundleWriter; +namespace { +const int kDramFlagOffset = 49; +} + +namespace embedding { +template +class EmbeddingVarCkptData { + public: + void Emplace(K key, void* value_ptr, const EmbeddingConfig& emb_config, + V* default_value, FeatureDescriptor* feat_desc, + bool is_save_freq, bool is_save_version, + bool save_unfiltered_features); + + void Emplace(K key, V* value_ptr); + + void SetWithPartition( + std::vector>& ev_ckpt_data_parts); + + Status ExportToCkpt(const string& tensor_name, BundleWriter* writer, + int64 value_len, ValueIterator* value_iter = nullptr); + + private: + std::vector key_vec_; + std::vector value_ptr_vec_; + std::vector version_vec_; + std::vector freq_vec_; + std::vector key_filter_vec_; + std::vector version_filter_vec_; + std::vector freq_filter_vec_; + std::vector part_offset_; + std::vector part_filter_offset_; + const int kSavedPartitionNum = 1000; +}; +} // namespace embedding +} // namespace tensorflow +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_CKPT_DATA_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_context.h b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_context.h new file mode 100644 index 00000000..192298a7 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_context.h @@ -0,0 +1,64 @@ +/* Copyright 2019 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_CONTEXT_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_CONTEXT_H_ + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "tensorflow/core/kernels/gpu_device_array.h" +#include "tensorflow/core/platform/stream_executor.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +template +struct EmbeddingVarContext; + +template <> +struct EmbeddingVarContext { + public: + EmbeddingVarContext(OpKernelContext* op_ctx) + : worker_threads(op_ctx->device()->tensorflow_cpu_worker_threads()) {} + + const DeviceBase::CpuWorkerThreads* worker_threads; +}; + +#if GOOGLE_CUDA +template <> +struct EmbeddingVarContext { + public: + EmbeddingVarContext(OpKernelContext* op_ctx) + : worker_threads(op_ctx->device()->tensorflow_cpu_worker_threads()), + compute_stream(op_ctx->op_device_context()->stream()), + event_mgr( + op_ctx->device()->tensorflow_accelerator_device_info()->event_mgr), + gpu_allocator(op_ctx->device()->GetAllocator(AllocatorAttributes())), + gpu_device(op_ctx->eigen_gpu_device()) {} + + const DeviceBase::CpuWorkerThreads* worker_threads = nullptr; + se::Stream* compute_stream = nullptr; + EventMgr* event_mgr = nullptr; + Allocator* gpu_allocator = nullptr; + const GPUDevice& gpu_device; +}; +#endif // GOOGLE_CUDA +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_CONTEXT_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_dump_iterator.h b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_dump_iterator.h new file mode 100644 index 00000000..ff52465e --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_dump_iterator.h @@ -0,0 +1,91 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_DUMP_ITERATOR_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_DUMP_ITERATOR_ +#include "embedding_config.h" +#include "kv_interface.h" +#include "tensorflow/core/platform/types.h" +namespace tensorflow { +template +class DumpIterator; + +namespace embedding { +template +class EVVectorDataDumpIterator : public DumpIterator { + public: + EVVectorDataDumpIterator(const std::vector& item_list) + : curr_iter_(item_list.begin()), end_iter_(item_list.end()) {} + + bool HasNext() const { return curr_iter_ != end_iter_; } + + T Next() { + T val = *curr_iter_; + curr_iter_++; + return val; + } + + private: + typename std::vector::const_iterator curr_iter_; + typename std::vector::const_iterator end_iter_; +}; + +template +class EV2dVectorDataDumpIterator : public DumpIterator { + public: + EV2dVectorDataDumpIterator(std::vector& valueptr_list, int64 value_len, + ValueIterator* val_iter) + : curr_iter_(valueptr_list.begin()), + end_iter_(valueptr_list.end()), + val_iter_(val_iter), + value_len_(value_len), + col_idx_(0) { + if (!valueptr_list.empty()) { + if ((int64)*curr_iter_ == ValuePtrStatus::NOT_IN_DRAM) { + curr_ptr_ = val_iter_->Next(); + } else { + curr_ptr_ = *curr_iter_; + } + } + } + + bool HasNext() const { return curr_iter_ != end_iter_; } + + T Next() { + T val = curr_ptr_[col_idx_++]; + if (col_idx_ >= value_len_) { + curr_iter_++; + col_idx_ = 0; + if (curr_iter_ != end_iter_) { + if ((int64)*curr_iter_ == ValuePtrStatus::NOT_IN_DRAM) { + curr_ptr_ = val_iter_->Next(); + } else { + curr_ptr_ = *curr_iter_; + } + } + } + return val; + } + + private: + typename std::vector::const_iterator curr_iter_; + typename std::vector::const_iterator end_iter_; + ValueIterator* val_iter_; + int64 value_len_; + int64 col_idx_; + T* curr_ptr_ = nullptr; +}; +} // namespace embedding +} // namespace tensorflow +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_DUMP_ITERATOR_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_restore.cc b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_restore.cc new file mode 100644 index 00000000..5f7eb9d1 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_restore.cc @@ -0,0 +1,646 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#include "embedding_var_restore.h" + +#include "deepray/custom_ops/embedding_variable/cc/kernels/save_restore_tensor_ev.h" +#include "deepray/custom_ops/embedding_variable/cc/lib/tensor_bundle.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/random/philox_random.h" +#include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/lib/random/random_distributions.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +template +int64 ReadRecord(BundleReader* reader, const string& record_key, K** buffer) { + TensorShape shape; + Status st; + st = reader->LookupTensorShape(record_key, &shape); + if (!st.ok()) { + LOG(FATAL) << "Restore record " << record_key << " failed"; + } + st = reader->LookupHeader(record_key, sizeof(K) * shape.dim_size(0)); + if (!st.ok()) { + LOG(FATAL) << "Restore record " << record_key << " failed"; + } + size_t bytes_read = 0; + *buffer = new K[shape.dim_size(0)]; + st = reader->LookupSegment(record_key, sizeof(K) * shape.dim_size(0), + (char*)*buffer, bytes_read); + if (!st.ok()) { + LOG(FATAL) << "Restore record " << record_key << " failed"; + } + return shape.dim_size(0); +} +#define REGISTER_KERNELS(ktype) \ + template int64 ReadRecord(BundleReader*, const string&, ktype**); +REGISTER_KERNELS(int32); +REGISTER_KERNELS(int64); +#undef REGISTER_KERNELS + +template +void CheckpointLoader::RestoreSSD() { + std::string name_string_temp(restore_args_.m_name_string); + std::string new_str = "_"; + int64 pos = name_string_temp.find("/"); + while (pos != std::string::npos) { + name_string_temp.replace(pos, 1, new_str.data(), 1); + pos = name_string_temp.find("/"); + } + std::string ssd_record_file_name = + restore_args_.m_file_name_string + "-" + name_string_temp + "-ssd_record"; + if (Env::Default()->FileExists(ssd_record_file_name + ".index").ok()) { + std::string ssd_emb_file_name = restore_args_.m_file_name_string + "-" + + name_string_temp + "-emb_files"; + BundleReader ssd_record_reader(Env::Default(), ssd_record_file_name); + RestoreSSDBuffer ssd_buffer(&ssd_record_reader); + VLOG(1) << "Loading SSD record... 
" << ssd_record_file_name; + storage_->RestoreSSD(ev_->GetEmbeddingIndex(), ev_->GetEmbeddingSlotNum(), + ev_->ValueLen(), ssd_emb_file_name, ev_, ssd_buffer); + } +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void CheckpointLoader::RestoreSSD(); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +void CheckpointLoader::RestoreInternal(const std::string& name_string, + const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, + RestoreBuffer& restore_buff) { + Status s = EVInitTensorNameAndShape(name_string); + if (!s.ok()) { + LOG(ERROR) << "EVInitTensorNameAndShape fail:" << s.ToString(); + return; + } + + Tensor part_offset_tensor; + Tensor part_filter_offset_tensor; + if (!restore_args_.m_is_oldform) { + /****** InitPartOffsetTensor ******/ + TensorShape part_offset_shape; + DataType part_offset_type; + string offset_tensor_name; + if (!restore_args_.m_is_incr) { + offset_tensor_name = name_string + kPartOffsetTensorSuffsix; + } else { + offset_tensor_name = name_string + kIncrPartOffsetTensorSuffsix; + } + + Status s = reader_->LookupDtypeAndShape( + offset_tensor_name, &part_offset_type, &part_offset_shape); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail:" << s.message(); + } + part_offset_tensor = + Tensor(cpu_allocator(), part_offset_type, part_offset_shape); + s = reader_->Lookup(offset_tensor_name, &part_offset_tensor); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail:" << s.message(); + } + + if (restore_args_.m_has_filter) { + TensorShape part_filter_offset_shape; + DataType part_filter_offset_type; + string offset_filter_tensor_name = + name_string + kPartFilterOffsetTensorSuffsix; + s = reader_->LookupDtypeAndShape(offset_filter_tensor_name, + &part_filter_offset_type, + &part_filter_offset_shape); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail: " << s.message(); + } + part_filter_offset_tensor = Tensor( + cpu_allocator(), part_filter_offset_type, part_filter_offset_shape); + s = reader_->Lookup(offset_filter_tensor_name, + &part_filter_offset_tensor); + if (!s.ok()) { + LOG(ERROR) << "EV restoring fail: " << s.message(); + } + } + } + + if (restore_args_.m_is_oldform) { + VLOG(1) << "old form, EV name:" << name_string + << ", partition_id:" << restore_args_.m_partition_id + << ", new partition num:" << restore_args_.m_partition_num; + int64 new_dim = ev_->ValueLen(); + TensorShape key_shape; + Status st = + reader_->LookupTensorShape(restore_args_.m_tensor_key, &key_shape); + if (!st.ok()) { + LOG(ERROR) << "EVRestoreFeaturesOld fail: " << st.message(); + } + int tot_key_num = key_shape.dim_size(0); + Status s = EVRestoreFeatures(tot_key_num, 0, 0, 0, 0, restore_buff, new_dim, + emb_config, device); + if (!s.ok()) { + LOG(ERROR) << "EVRestoreFeaturesOld fail: " << s.message(); + } + } else { + int64 new_dim = ev_->ValueLen(); + VLOG(1) << "new form checkpoint... 
:" << name_string + << " , partition_id:" << restore_args_.m_partition_id + << " , partition_num:" << restore_args_.m_partition_num; + auto part_offset_flat = part_offset_tensor.flat(); + for (size_t i = 0; i < restore_args_.m_loaded_parts.size(); i++) { + int subpart_id = restore_args_.m_loaded_parts[i]; + size_t value_unit_bytes = sizeof(V) * restore_args_.m_old_dim; + size_t value_unit_bytes_new = sizeof(V) * new_dim; + int subpart_offset = part_offset_flat(subpart_id); + int tot_key_num = part_offset_flat(subpart_id + 1) - subpart_offset; + int64 key_part_offset = subpart_offset * sizeof(K); + int64 value_part_offset = + subpart_offset * sizeof(V) * restore_args_.m_old_dim; + int64 version_part_offset = subpart_offset * sizeof(int64); + int64 freq_part_offset = subpart_offset * sizeof(int64); + VLOG(1) << "dynamically load ev : " << name_string + << ", subpartid:" << subpart_id; + + EVRestoreFeatures(tot_key_num, key_part_offset, value_part_offset, + version_part_offset, freq_part_offset, restore_buff, + new_dim, emb_config, device); + + if (restore_args_.m_has_filter) { + auto part_filter_offset_flat = part_filter_offset_tensor.flat(); + Status s = EVRestoreFilteredFeatures(subpart_id, new_dim, restore_buff, + part_filter_offset_flat, + emb_config, device); + if (!s.ok()) { + LOG(ERROR) << "EVRestoreFilteredFeatures fail: " << s.message(); + } + } + } + } +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void CheckpointLoader::RestoreInternal( \ + const std::string&, const EmbeddingConfig&, const Eigen::GpuDevice*, \ + RestoreBuffer&); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +bool CheckpointLoader::IsOldCheckpoint( + const std::string& curr_partid_str, + const std::string& kPartOffsetTensorSuffsix) { + if (restore_args_.m_name_string.find(kPartStr) == std::string::npos) { + string tensor_name = restore_args_.m_name_string; + TensorShape part_offset_shape; + DataType part_offset_type; + Status st = + reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, + &part_offset_type, &part_offset_shape); + if (st.ok()) return false; + + string part_id = std::to_string(0); + tensor_name = restore_args_.m_name_string + "/" + kPartStr + part_id; + + Status form_st = + reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, + &part_offset_type, &part_offset_shape); + if (form_st.ok()) return false; + } else { + string part_id = std::to_string(0); + size_t part_pos = restore_args_.m_name_string.find(kPartStr); + size_t part_size = strlen(kPartStr); + size_t cur_part_size = curr_partid_str.size(); + + string pre_subname = restore_args_.m_name_string.substr(0, part_pos); + string post_subname = restore_args_.m_name_string.substr( + part_pos + part_size + cur_part_size); + string tensor_name = pre_subname + kPartStr + part_id + post_subname; + + TensorShape part_offset_shape; + DataType part_offset_type; + Status form_st = + reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, + &part_offset_type, &part_offset_shape); + if (form_st.ok()) return false; + pre_subname = restore_args_.m_name_string.substr(0, part_pos - 1); /* var1*/ + post_subname = restore_args_.m_name_string.substr(part_pos + part_size + + cur_part_size); + tensor_name = pre_subname + post_subname; + + Status st = + reader_->LookupDtypeAndShape(tensor_name + kPartOffsetTensorSuffsix, + 
&part_offset_type, &part_offset_shape); + if (st.ok()) return false; + } + + return true; +} +#define REGISTER_KERNELS(ktype, vtype) \ + template bool CheckpointLoader::IsOldCheckpoint( \ + const std::string&, const std::string&); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +void CheckpointLoader::InitPartNumAndLoadedParts( + std::vector& tensor_name_vec) { + std::string tmp_key_suffix; + std::string tmp_kPartOffsetTensorSuffsix; + if (!restore_args_.m_is_incr) { + tmp_key_suffix = kKeySuffix; + tmp_kPartOffsetTensorSuffsix = kPartOffsetTensorSuffsix; + } else { + tmp_key_suffix = kIncrKeySuffix; + tmp_kPartOffsetTensorSuffsix = kIncrPartOffsetTensorSuffsix; + } + + restore_args_.m_loaded_parts.reserve(kSavedPartitionNum); + int orig_partnum = 0; + const string& curr_partid_str = std::to_string(restore_args_.m_partition_id); + size_t part_pos = restore_args_.m_name_string.find(kPartStr); + + if (IsOldCheckpoint(curr_partid_str, tmp_kPartOffsetTensorSuffsix)) { + restore_args_.m_is_oldform = true; + } + + if (part_pos == std::string::npos) { + for (;; orig_partnum++) { + string part_id = std::to_string(orig_partnum); + string tensor_name = + restore_args_.m_name_string + "/" + kPartStr + part_id; + string tensor_key = tensor_name + tmp_key_suffix; + TensorShape key_shape; + Status st = reader_->LookupTensorShape(tensor_key, &key_shape); + if (!st.ok()) { + break; + } + tensor_name_vec.emplace_back(tensor_name); + } + if (orig_partnum == 0) { + tensor_name_vec.emplace_back(restore_args_.m_name_string); + } + for (int i = 0; i < kSavedPartitionNum; ++i) { + restore_args_.m_loaded_parts.push_back(i); + } + } else { + for (;; orig_partnum++) { + string part_id = std::to_string(orig_partnum); + string pre_subname = restore_args_.m_name_string.substr(0, part_pos); + string post_subname = restore_args_.m_name_string.substr( + part_pos + strlen(kPartStr) + curr_partid_str.size()); + string tensor_name = pre_subname + kPartStr + part_id + post_subname; + string tensor_key = tensor_name + tmp_key_suffix; + TensorShape key_shape; + Status st = reader_->LookupTensorShape(tensor_key, &key_shape); + if (!st.ok()) { + break; + } + tensor_name_vec.emplace_back(tensor_name); + } + if (orig_partnum == 0) { + string pre_subname = restore_args_.m_name_string.substr(0, part_pos - 1); + string post_subname = restore_args_.m_name_string.substr( + part_pos + strlen(kPartStr) + curr_partid_str.size()); + string tmp_name = pre_subname + post_subname; + tensor_name_vec.emplace_back(tmp_name); + } + for (int i = 0; i < kSavedPartitionNum; i++) { + if (i % restore_args_.m_partition_num == restore_args_.m_partition_id) { + restore_args_.m_loaded_parts.push_back(i); + } + } + } + for (auto& tensor_name : tensor_name_vec) { + VLOG(1) << "**** " << restore_args_.m_name_string << " " << tensor_name + << " ****"; + } +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void CheckpointLoader::InitPartNumAndLoadedParts( \ + std::vector&); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +Status CheckpointLoader::EVInitTensorNameAndShape( + const std::string& tensor_name) { + if (!restore_args_.m_is_incr) { + restore_args_.m_tensor_key = tensor_name + kKeySuffix; 
+ restore_args_.m_tensor_value = tensor_name + kValueSuffix; + restore_args_.m_tensor_version = tensor_name + kVersionSuffix; + restore_args_.m_tensor_freq = tensor_name + kFreqSuffix; + } else { + restore_args_.m_tensor_key = tensor_name + kIncrKeySuffix; + restore_args_.m_tensor_value = tensor_name + kIncrValueSuffix; + restore_args_.m_tensor_version = tensor_name + kIncrVersionSuffix; + restore_args_.m_tensor_freq = tensor_name + kIncrFreqSuffix; + } + + TensorShape key_shape, value_shape, version_shape, freq_shape; + + Status st = + reader_->LookupTensorShape(restore_args_.m_tensor_key, &key_shape); + if (!st.ok()) { + return st; + } + st = reader_->LookupTensorShape(restore_args_.m_tensor_value, &value_shape); + if (!st.ok()) { + return st; + } + st = reader_->LookupTensorShape(restore_args_.m_tensor_version, + &version_shape); + if (!st.ok()) { + return st; + } + st = reader_->LookupHeader(restore_args_.m_tensor_key, + sizeof(K) * key_shape.dim_size(0)); + if (!st.ok()) { + return st; + } + st = reader_->LookupHeader( + restore_args_.m_tensor_value, + sizeof(V) * value_shape.dim_size(0) * value_shape.dim_size(1)); + if (!st.ok()) { + return st; + } + st = reader_->LookupHeader(restore_args_.m_tensor_version, + sizeof(int64) * version_shape.dim_size(0)); + if (!st.ok()) { + return st; + } + st = reader_->LookupTensorShape(restore_args_.m_tensor_freq, &freq_shape); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + freq_shape = version_shape; + } else { + return st; + } + } + st = reader_->LookupHeader(restore_args_.m_tensor_freq, + sizeof(int64) * freq_shape.dim_size(0)); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + restore_args_.m_has_freq = false; + } else { + return st; + } + } + restore_args_.m_old_dim = value_shape.dim_size(1); + + if (!restore_args_.m_is_oldform) { + TensorShape key_filter_shape, version_filter_shape, freq_filter_shape; + st = reader_->LookupTensorShape(restore_args_.m_tensor_key + "_filtered", + &key_filter_shape); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + key_filter_shape = key_shape; + restore_args_.m_has_filter = false; + } else { + return st; + } + } + st = reader_->LookupTensorShape( + restore_args_.m_tensor_version + "_filtered", &version_filter_shape); + if ((!st.ok()) && (st.code() != error::NOT_FOUND)) { + return st; + } + st = reader_->LookupHeader(restore_args_.m_tensor_key + "_filtered", + sizeof(K) * key_filter_shape.dim_size(0)); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + restore_args_.m_has_filter = false; + } else { + return st; + } + } + st = reader_->LookupHeader(restore_args_.m_tensor_version + "_filtered", + sizeof(K) * version_filter_shape.dim_size(0)); + if (!st.ok() && st.code() != error::NOT_FOUND) { + return st; + } + st = reader_->LookupTensorShape(restore_args_.m_tensor_freq + "_filtered", + &freq_filter_shape); + if (!st.ok()) { + if (st.code() == error::NOT_FOUND) { + freq_filter_shape = freq_shape; + } else { + return st; + } + } + + st = reader_->LookupHeader(restore_args_.m_tensor_freq + "_filtered", + sizeof(K) * freq_filter_shape.dim_size(0)); + if (!st.ok() && st.code() != error::NOT_FOUND) { + return st; + } + } + + return OkStatus(); +} +#define REGISTER_KERNELS(ktype, vtype) \ + template Status CheckpointLoader::EVInitTensorNameAndShape( \ + const std::string&); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef 
REGISTER_KERNELS + +template +Status CheckpointLoader::EVRestoreFeatures( + int tot_key_num, int64 key_part_offset, int64 value_part_offset, + int64 version_part_offset, int64 freq_part_offset, + RestoreBuffer& restore_buff, int64 new_dim, + const EmbeddingConfig& emb_config, const Eigen::GpuDevice* device) { + size_t value_unit_bytes = sizeof(V) * restore_args_.m_old_dim; + size_t value_unit_bytes_new = sizeof(V) * new_dim; + int64 tot_key_bytes_read(0); + int64 tot_value_bytes_read(0); + int64 tot_version_bytes_read(0); + int64 tot_freq_bytes_read(0); + size_t key_bytes_read = 0; + size_t value_bytes_read = 0; + size_t version_bytes_read = 0; + size_t freq_bytes_read = 0; + + while (tot_key_num > 0) { + size_t read_key_num = std::min( + std::min(kBufferSize / sizeof(K), kBufferSize / value_unit_bytes), + kBufferSize / sizeof(int64)); + read_key_num = std::min(read_key_num, kBufferSize / value_unit_bytes_new); + read_key_num = std::min((int)read_key_num, tot_key_num); + reader_->LookupSegmentOffset( + restore_args_.m_tensor_key, key_part_offset + tot_key_bytes_read, + read_key_num * sizeof(K), restore_buff.key_buffer, key_bytes_read); + reader_->LookupSegmentOffset(restore_args_.m_tensor_value, + value_part_offset + tot_value_bytes_read, + read_key_num * value_unit_bytes, + restore_buff.value_buffer, value_bytes_read); + if (!restore_args_.m_reset_version) { + reader_->LookupSegmentOffset(restore_args_.m_tensor_version, + version_part_offset + tot_version_bytes_read, + read_key_num * sizeof(int64), + restore_buff.version_buffer, + version_bytes_read); + if (version_bytes_read == 0) { + memset(restore_buff.version_buffer, -1, sizeof(int64) * read_key_num); + } + } else { + int64* version_tmp = (int64*)restore_buff.version_buffer; + memset(version_tmp, 0, read_key_num * sizeof(int64)); + } + + if (restore_args_.m_has_freq) { + reader_->LookupSegmentOffset(restore_args_.m_tensor_freq, + freq_part_offset + tot_freq_bytes_read, + read_key_num * sizeof(int64), + restore_buff.freq_buffer, freq_bytes_read); + if (freq_bytes_read == 0) { + int64* freq_tmp = (int64*)restore_buff.freq_buffer; + for (int64 i = 0; i < read_key_num; i++) { + freq_tmp[i] = (ev_->MinFreq() == 0) ? 1 : ev_->MinFreq(); + } + } + } else { + int64* freq_tmp = (int64*)restore_buff.freq_buffer; + for (int64 i = 0; i < read_key_num; i++) { + freq_tmp[i] = (ev_->MinFreq() == 0) ? 
1 : ev_->MinFreq(); + } + } + if (key_bytes_read > 0) { + read_key_num = key_bytes_read / sizeof(K); + Status st = RestoreCustomDim(new_dim, read_key_num, value_unit_bytes, + value_bytes_read, value_unit_bytes_new, + restore_buff); + if (!st.ok()) { + LOG(FATAL) << "EV Restore fail:" << st.ToString(); + } + + st = storage_->RestoreFeatures( + read_key_num, kSavedPartitionNum, restore_args_.m_partition_id, + restore_args_.m_partition_num, new_dim, false, + restore_args_.m_is_incr, emb_config, device, filter_, restore_buff); + if (!st.ok()) { + LOG(FATAL) << "EV Restore fail:" << st.ToString(); + } + } + + tot_key_num -= read_key_num; + tot_key_bytes_read += key_bytes_read; + tot_value_bytes_read += value_bytes_read; + tot_version_bytes_read += version_bytes_read; + tot_freq_bytes_read += freq_bytes_read; + } + + return OkStatus(); +} +#define REGISTER_KERNELS(ktype, vtype) \ + template Status CheckpointLoader::EVRestoreFeatures( \ + int, int64, int64, int64, int64, RestoreBuffer&, int64, \ + const EmbeddingConfig&, const Eigen::GpuDevice*); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +template +Status CheckpointLoader::EVRestoreFilteredFeatures( + int64 subpart_id, int64 value_len, RestoreBuffer& restore_buff, + typename TTypes::Flat part_filter_offset_flat, + const EmbeddingConfig& emb_config, const Eigen::GpuDevice* device) { + int subpart_filter_offset = part_filter_offset_flat(subpart_id); + int tot_key_filter_num = + part_filter_offset_flat(subpart_id + 1) - subpart_filter_offset; + int64 key_filter_part_offset = subpart_filter_offset * sizeof(K); + int64 version_filter_part_offset = subpart_filter_offset * sizeof(int64); + int64 freq_filter_part_offset = subpart_filter_offset * sizeof(int64); + + VLOG(1) << "key_filter_num: " << tot_key_filter_num + << ", subpart_filter_offset: " << subpart_filter_offset; + + size_t key_filter_bytes_read = 0; + size_t version_filter_bytes_read = 0; + size_t freq_filter_bytes_read = 0; + + while (tot_key_filter_num > 0) { + size_t read_key_num = + std::min(kBufferSize / sizeof(K), kBufferSize / sizeof(int64)); + read_key_num = std::min((int)read_key_num, tot_key_filter_num); + reader_->LookupSegmentOffset(restore_args_.m_tensor_key + "_filtered", + key_filter_part_offset + key_filter_bytes_read, + read_key_num * sizeof(K), + restore_buff.key_buffer, + key_filter_bytes_read); + if (!restore_args_.m_reset_version) { + reader_->LookupSegmentOffset( + restore_args_.m_tensor_version + "_filtered", + version_filter_part_offset + version_filter_bytes_read, + read_key_num * sizeof(int64), restore_buff.version_buffer, + version_filter_bytes_read); + } else { + int64* version_tmp = (int64*)restore_buff.version_buffer; + memset(version_tmp, 0, read_key_num * sizeof(int64)); + } + reader_->LookupSegmentOffset( + restore_args_.m_tensor_freq + "_filtered", + freq_filter_part_offset + freq_filter_bytes_read, + read_key_num * sizeof(int64), restore_buff.freq_buffer, + freq_filter_bytes_read); + if (key_filter_bytes_read > 0) { + read_key_num = key_filter_bytes_read / sizeof(K); + VLOG(2) << "restore, read_key_num:" << read_key_num; + Status st = storage_->RestoreFeatures( + read_key_num, kSavedPartitionNum, restore_args_.m_partition_id, + restore_args_.m_partition_num, value_len, true, + restore_args_.m_is_incr, emb_config, device, filter_, restore_buff); + if (!st.ok()) return st; + 
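+      // Only advance by the number of filtered keys actually read in this pass.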
tot_key_filter_num -= read_key_num; + } + } + return OkStatus(); +} +#define REGISTER_KERNELS(ktype, vtype) \ + template Status CheckpointLoader::EVRestoreFilteredFeatures( \ + int64, int64, RestoreBuffer&, typename TTypes::Flat, \ + const EmbeddingConfig&, const Eigen::GpuDevice*); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_restore.h b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_restore.h new file mode 100644 index 00000000..4235d8fb --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_restore.h @@ -0,0 +1,223 @@ +/* Copyright 2023 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_RESTORE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_RESTORE_H_ + +#include "embedding_config.h" +#include "embedding_var.h" +#include "filter_policy.h" +#include "storage.h" +#include "tensorflow/core/util/env_var.h" + +namespace tensorflow { +using GPUDevice = Eigen::GpuDevice; + +template +class EmbeddingVar; + +namespace { +const size_t kBufferSize = 8 << 20; +constexpr char kPartStr[] = "part_"; + +constexpr char kPartOffsetTensorSuffsix[] = "-partition_offset"; +constexpr char kPartFilterOffsetTensorSuffsix[] = "-partition_filter_offset"; +constexpr char kKeySuffix[] = "-keys"; +constexpr char kValueSuffix[] = "-values"; +constexpr char kVersionSuffix[] = "-versions"; +constexpr char kFreqSuffix[] = "-freqs"; + +constexpr char kIncrPartOffsetTensorSuffsix[] = "-incr_partition_offset"; +constexpr char kIncrKeySuffix[] = "-sparse_incr_keys"; +constexpr char kIncrValueSuffix[] = "-sparse_incr_values"; +constexpr char kIncrVersionSuffix[] = "-sparse_incr_versions"; +constexpr char kIncrFreqSuffix[] = "-sparse_incr_freqs"; +} // namespace + +template +int64 ReadRecord(BundleReader* reader, const string& record_key, K** buffer); + +template +struct RestoreSSDBuffer { + int64* file_list_buf = nullptr; + int64* invalid_record_count_list_buf = nullptr; + int64* record_count_list_buf = nullptr; + K* key_list_buf = nullptr; + int64* key_file_id_list_buf = nullptr; + int64* key_offset_list_buf = nullptr; + int64 num_of_keys = 0; + int64 num_of_files = 0; + + explicit RestoreSSDBuffer(BundleReader* ssd_record_reader) { + num_of_files = ReadRecord(ssd_record_reader, "files", &file_list_buf); + + ReadRecord(ssd_record_reader, "invalid_record_count", + &invalid_record_count_list_buf); + ReadRecord(ssd_record_reader, "record_count", &record_count_list_buf); + num_of_keys = ReadRecord(ssd_record_reader, "keys", &key_list_buf); + + ReadRecord(ssd_record_reader, "keys_file_id", &key_file_id_list_buf); + 
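+    // Note: keys_file_id / keys_offset appear to locate each key's record within
+    // the SSD files; all buffers read here are released in the destructor below.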
ReadRecord(ssd_record_reader, "keys_offset", &key_offset_list_buf); + } + + ~RestoreSSDBuffer() { + delete[] file_list_buf; + delete[] invalid_record_count_list_buf; + delete[] record_count_list_buf; + delete[] key_list_buf; + delete[] key_file_id_list_buf; + delete[] key_offset_list_buf; + } +}; + +struct RestoreArgs { + std::string m_file_name_string; + std::string m_name_string; + std::string m_tensor_key; + std::string m_tensor_value; + std::string m_tensor_version; + std::string m_tensor_freq; + std::vector m_loaded_parts; + int64 m_partition_id; + int64 m_partition_num; + int64 m_idx; + int m_old_dim; + bool m_is_incr; + bool m_reset_version; + bool m_has_freq; + bool m_has_filter; + bool m_is_oldform; + RestoreArgs(const std::string name_string, const std::string file_name_string, + int64 partition_id, int64 partition_num, bool is_incr, + bool reset_version) + : m_name_string(name_string), + m_file_name_string(file_name_string), + m_partition_id(partition_id), + m_partition_num(partition_num), + m_idx(0), + m_old_dim(0), + m_is_incr(is_incr), + m_reset_version(reset_version), + m_has_freq(true), + m_has_filter(true), + m_is_oldform(false) {} + RestoreArgs() = default; +}; + +template +class CheckpointLoader { + public: + CheckpointLoader(embedding::Storage* storage, EmbeddingVar* ev, + FilterPolicy>* filter, + const std::string& name_string, + const std::string& file_name_string, int64 partition_id, + int64 partition_num, bool is_incr, bool reset_version, + BundleReader* reader) + : storage_(storage), ev_(ev), filter_(filter), reader_(reader) { + restore_args_ = RestoreArgs(name_string, file_name_string, partition_id, + partition_num, is_incr, reset_version); + } + + void RestoreCkpt(const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device) { + /* Step 1: Restore SSD ckpt Data (Optional) + Step 2; Restore model ckpt */ + RestoreSSD(); + + std::vector tensor_name_vec; + InitPartNumAndLoadedParts(tensor_name_vec); + + RestoreBuffer restore_buff(kBufferSize); + for (auto& tensor_name : tensor_name_vec) { + RestoreInternal(tensor_name, emb_config, device, restore_buff); + } + } + + void RestoreInternal(const std::string& name_string, + const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, + RestoreBuffer& restore_buff); + + private: + void RestoreSSD(); + + bool IsOldCheckpoint(const std::string& curr_partid_str, + const std::string& kPartOffsetTensorSuffsix); + + void InitPartNumAndLoadedParts(std::vector& tensor_name_vec); + + Status EVInitTensorNameAndShape(const std::string& tensor_name); + + Status EVRestoreFeatures(int tot_key_num, int64 key_part_offset, + int64 value_part_offset, int64 version_part_offset, + int64 freq_part_offset, RestoreBuffer& restore_buff, + int64 new_dim, const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device); + + Status EVRestoreFilteredFeatures( + int64 subpart_id, int64 value_len, RestoreBuffer& restore_buff, + typename TTypes::Flat part_filter_offset_flat, + const EmbeddingConfig& emb_config, const Eigen::GpuDevice* device); + + Status RestoreCustomDim(int new_dim, int read_key_num, + size_t value_unit_bytes, size_t value_bytes_read, + size_t value_unit_bytes_new, + RestoreBuffer& restore_buff) { + bool restore_customDim; + TF_CHECK_OK(ReadBoolFromEnvVar("TF_EV_RESTORE_CUSTOM_DIM", false, + &restore_customDim)); + if (restore_customDim && ev_->IsUseHbm()) { + return errors::FailedPrecondition( + "HBM EV not and custom dim," + "are not supported used together"); + } + if (restore_customDim && restore_args_.m_old_dim 
!= new_dim) { + VLOG(2) << "restore, read_value_reshape dim: from " + << restore_args_.m_old_dim << " to " << new_dim; + if (read_key_num * value_unit_bytes != value_bytes_read) { + return tensorflow::errors::FailedPrecondition( + "Expected read_key_num * value_unit_bytes == " + "value_bytes_read, but got read_key_num * value_unit_bytes " + "!= value_bytes_read!"); + } + + std::unique_ptr tmp_ptr(new char[kBufferSize]); + size_t read_once = std::min(value_unit_bytes, value_unit_bytes_new); + for (int i = 0; i < read_key_num; ++i) { + memcpy(tmp_ptr.get() + i * value_unit_bytes_new, + restore_buff.value_buffer + i * value_unit_bytes, read_once); + if (restore_args_.m_old_dim >= new_dim) continue; + auto p = ev_->GetDefaultValue(restore_args_.m_idx++); + memcpy(tmp_ptr.get() + i * value_unit_bytes_new + value_unit_bytes, + p + value_unit_bytes, value_unit_bytes_new - value_unit_bytes); + } + auto tmp = tmp_ptr.release(); + tmp_ptr.reset(restore_buff.value_buffer); + restore_buff.value_buffer = tmp; + } + return OkStatus(); + } + + private: + embedding::Storage* storage_; + EmbeddingVar* ev_; + FilterPolicy>* filter_; + BundleReader* reader_; + RestoreArgs restore_args_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EMBEDDING_VAR_RESTORE_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/eviction_manager.h b/deepray/custom_ops/embedding_variable/cc/embedding/eviction_manager.h new file mode 100644 index 00000000..766362da --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/eviction_manager.h @@ -0,0 +1,139 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EVICTION_MANAGER_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EVICTION_MANAGER_H_ + +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/util/env_var.h" + +namespace tensorflow { + +namespace embedding { +template +class MultiTierStorage; + +template +struct StorageItem { + volatile bool is_occupied; + volatile bool is_deleted; + + StorageItem(bool is_occupied, volatile bool is_deleted) + : is_occupied(is_occupied), is_deleted(is_deleted) {} +}; + +template +class EvictionManager { + public: + EvictionManager() { + num_of_threads_ = 1; + TF_CHECK_OK(ReadInt64FromEnvVar("TF_MULTI_TIER_EV_EVICTION_THREADS", 1, + &num_of_threads_)); + thread_pool_.reset(new thread::ThreadPool(Env::Default(), ThreadOptions(), + "EVICTION_MANAGER", 3, + /*low_latency_hint=*/false)); + } + + ~EvictionManager() {} + + TF_DISALLOW_COPY_AND_ASSIGN(EvictionManager); + + void Schedule(std::function fn) { + thread_pool_->Schedule(std::move(fn)); + } + + void AddStorage(MultiTierStorage* storage) { + mutex_lock l(mu_); + auto ret = storage_table_.emplace( + std::make_pair(storage, new StorageItem(false, false))); + if (ret.second && num_of_active_threads_ < num_of_threads_) StartThread(); + } + + void DeleteStorage(MultiTierStorage* storage) { + auto storage_item = storage_table_[storage]; + bool delete_flag = false; + while (!delete_flag) { + volatile bool* occupy_flag = &storage_item->is_occupied; + delete_flag = __sync_bool_compare_and_swap(occupy_flag, false, true); + if (delete_flag) { + storage_item->is_deleted = true; + } + *occupy_flag = false; + } + } + + private: + void StartThread() { + while (this->flag_.test_and_set(std::memory_order_acquire)); + if (num_of_active_threads_ < num_of_threads_) { + __sync_fetch_and_add(&num_of_active_threads_, 1); + thread_pool_->Schedule([this]() { EvictionLoop(); }); + } + this->flag_.clear(std::memory_order_release); + } + + bool CheckStorages() { + mutex_lock l(mu_); + for (auto it = storage_table_.begin(); it != storage_table_.end();) { + if (!(it->second)->is_deleted) + return true; + else + it = storage_table_.erase(it); + } + return false; + } + + void EvictionLoop() { + while (CheckStorages()) { + mutex_lock l(mu_); + for (auto it : storage_table_) { + auto storage = it.first; + auto storage_item = it.second; + volatile bool* occupy_flag = &storage_item->is_occupied; + if (__sync_bool_compare_and_swap(occupy_flag, false, true)) { + if (storage_item->is_deleted) { + *occupy_flag = false; + continue; + } + storage->BatchEviction(); + *occupy_flag = false; + } + Env::Default()->SleepForMicroseconds(1); + } + } + __sync_fetch_and_sub(&num_of_active_threads_, 1); + } + + int64 num_of_threads_; + int64 num_of_active_threads_; + std::atomic_flag flag_ = ATOMIC_FLAG_INIT; + std::map*, StorageItem*> storage_table_; + std::unique_ptr thread_pool_; + mutex mu_; +}; + +class EvictionManagerCreator { + public: + template + static EvictionManager* Create() { + static EvictionManager eviction_manager; + return &eviction_manager; + } +}; + +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_EVICTION_MANAGER_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/feature_descriptor.h b/deepray/custom_ops/embedding_variable/cc/embedding/feature_descriptor.h new file mode 100644 index 00000000..05787c6a --- /dev/null +++ 
b/deepray/custom_ops/embedding_variable/cc/embedding/feature_descriptor.h @@ -0,0 +1,154 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_H_ +#include + +#include "counter_filter_descriptor_impl.h" +#include "deepray/custom_ops/embedding_variable/config.pb.h" +#include "dynamic_dim_feature_descriptor_impl.h" +#include "feature_descriptor_impl.h" +#include "hbm_multi_tier_feature_descriptor.h" +#include "normal_feature_descriptor.h" +#include "tensorflow/core/util/env_var.h" + +namespace tensorflow { +namespace embedding { + +template +class HbmMultiTierFeatureDescriptorImpl; + +template +class NormalFeatureDescriptorImpl; + +template +class CounterFilterDescriptorImpl; + +template +class FeatureDescriptor { + public: + FeatureDescriptor(int64 block_num, int64 slot_num, Allocator* alloc, + StorageType storage_type, bool need_record_freq, + bool need_record_version, + const std::pair& filter_info) { + if (block_num > 1) { + feat_desc_impl_.reset( + new DynmaicDimDescriptorImpl(alloc, block_num * slot_num)); + } else if (filter_info.first) { + feat_desc_impl_.reset(new CounterFilterDescriptorImpl( + alloc, slot_num, need_record_freq, need_record_version, + filter_info.second, storage_type)); + } else if (storage_type == StorageType::HBM_DRAM || + storage_type == StorageType::HBM_DRAM_SSDHASH) { + feat_desc_impl_.reset(new HbmMultiTierFeatureDescriptorImpl( + alloc, slot_num, need_record_freq, need_record_version)); + } else { + feat_desc_impl_.reset(new NormalFeatureDescriptorImpl( + alloc, slot_num, need_record_freq, need_record_version)); + } + } + + FeatureDescriptor(FeatureDescriptor* feat_desc) { + if (typeid(*(feat_desc->feat_desc_impl_.get())) == + typeid(CounterFilterDescriptorImpl*)) { + feat_desc_impl_.reset(new CounterFilterDescriptorImpl( + dynamic_cast*>( + feat_desc->feat_desc_impl_.get()))); + } else if (typeid(*(feat_desc->feat_desc_impl_.get())) == + typeid(HbmMultiTierFeatureDescriptorImpl)) { + feat_desc_impl_.reset(new NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc->feat_desc_impl_.get()))); + } else { + feat_desc_impl_.reset(new NormalFeatureDescriptorImpl( + dynamic_cast*>( + feat_desc->feat_desc_impl_.get()))); + } + } + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) { + return feat_desc_impl_->InitSlotInfo(emb_index, embedding_dim, + default_value); + } + + bool InitSlotInfo(FeatureDescriptor* feat_desc) { + return feat_desc_impl_->InitSlotInfo(feat_desc->feat_desc_impl_.get()); + } + + V* GetEmbedding(void* val, int emb_index) { + return feat_desc_impl_->GetEmbedding(val, emb_index); + } + + void* Allocate() { return feat_desc_impl_->Allocate(); } + + void* Allocate(int64 freq) { return feat_desc_impl_->Allocate(freq); } + + void Deallocate(void* val) { 
feat_desc_impl_->Deallocate(val); } + + void Deallocate(const std::vector& value_ptrs) { + feat_desc_impl_->Deallocate(value_ptrs); + } + + void SetDefaultValue(void* val, int64 index) { + feat_desc_impl_->SetDefaultValue(val, index); + } + + void SetValue(void* val, int64 emb_index, V* value) { + feat_desc_impl_->SetValue(val, emb_index, value); + } + +#if GOOGLE_CUDA + template + void SetDefaultValues(const K* keys, const std::list& init_cursor, + void** value_ptrs, se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + reinterpret_cast*>( + feat_desc_impl_.get()) + ->SetDefaultValues(keys, init_cursor, value_ptrs, compute_stream, + event_mgr, gpu_device); + } +#endif + + void SetAllocator(Allocator* alloc) { feat_desc_impl_->SetAllocator(alloc); } + + int data_bytes() { return feat_desc_impl_->data_bytes(); } + + int64 GetFreq(void* val) { return feat_desc_impl_->GetFreq(val); } + + int64 GetVersion(void* val) { return feat_desc_impl_->GetVersion(val); } + + void SetFreq(void* val, int64 freq) { feat_desc_impl_->SetFreq(val, freq); } + + void UpdateVersion(void* val, int64 version) { + feat_desc_impl_->UpdateVersion(val, version); + } + + void AddFreq(void* val, int64 freq) { feat_desc_impl_->AddFreq(val, freq); } + + int total_dim() { return feat_desc_impl_->total_dim(); } + + bool IsAdmit(void* val) { return feat_desc_impl_->IsAdmit(val); } + + void* Admit(void* val) { return feat_desc_impl_->Admit(val); } + + protected: + std::unique_ptr> feat_desc_impl_; +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/feature_descriptor_impl.h b/deepray/custom_ops/embedding_variable/cc/embedding/feature_descriptor_impl.h new file mode 100644 index 00000000..18dc6696 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/feature_descriptor_impl.h @@ -0,0 +1,299 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_IMPL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_IMPL_H_ +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/util/env_var.h" + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" +#include "tensorflow/core/platform/stream_executor.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { +namespace embedding { +struct SlotInfo { + int embedding_dim; + int embedding_offset; + void* default_value; + int64 default_value_dim; + int default_value_len; +}; + +class BaseFreqDescriptor { + public: + virtual int64 GetFreq(void* value_ptr) = 0; + virtual void AddFreq(void* value_ptr, int64 freq) {} + virtual void SetFreq(void* value_ptr, int64 freq) {} + virtual BaseFreqDescriptor* Clone() = 0; + virtual void SetOffset(int* alloc_bytes) {} +}; + +class FreqDescriptor : public BaseFreqDescriptor { + public: + explicit FreqDescriptor(int offset_byte) : offset_byte_(offset_byte) {} + + int64 GetFreq(void* value_ptr) override { + return *(int64*)(value_ptr + offset_byte_); + } + + void AddFreq(void* value_ptr, int64 freq) override { + __sync_fetch_and_add((int64*)(value_ptr + offset_byte_), freq); + } + + void SetFreq(void* value_ptr, int64 freq) override { + *(int64*)(value_ptr + offset_byte_) = freq; + } + + BaseFreqDescriptor* Clone() override { + return new FreqDescriptor(offset_byte_); + } + + void SetOffset(int* alloc_bytes) override { + offset_byte_ = *alloc_bytes; + *alloc_bytes += sizeof(int64); + } + + private: + int offset_byte_; +}; + +class NonFreqDescriptor : public BaseFreqDescriptor { + public: + int64 GetFreq(void* value_ptr) override { + LOG(FATAL) << "Can not get freq from NonFreqCounter."; + } + + BaseFreqDescriptor* Clone() override { return new NonFreqDescriptor(); } +}; + +class BaseVersionDescriptor { + public: + virtual int64 GetVersion(void* value_ptr) = 0; + virtual void UpdateVersion(void* value_ptr, int64 version) {} + virtual BaseVersionDescriptor* Clone() = 0; + virtual void SetOffset(int* alloc_bytes) {} +}; + +class VersionDescriptor : public BaseVersionDescriptor { + public: + explicit VersionDescriptor(int offset_byte) : offset_byte_(offset_byte) {} + + int64 GetVersion(void* value_ptr) override { + return *(int64*)(value_ptr + offset_byte_); + } + + void UpdateVersion(void* value_ptr, int64 version) override { + *(int64*)(value_ptr + offset_byte_) = version; + } + + BaseVersionDescriptor* Clone() override { + return new VersionDescriptor(offset_byte_); + } + + void SetOffset(int* alloc_bytes) override { + offset_byte_ = *alloc_bytes; + *alloc_bytes += sizeof(int64); + } + + private: + int offset_byte_; +}; + +class NonVersionDescriptor : public BaseVersionDescriptor { + public: + int64 GetVersion(void* value_ptr) override { + LOG(FATAL) << "Can not get version from NonFreqCounter."; + } + + BaseVersionDescriptor* Clone() override { return new NonVersionDescriptor(); } +}; + +template +class FeatureDescriptorImpl { + public: + FeatureDescriptorImpl(int64 slot_num, bool need_record_freq, + bool need_record_version) { + slot_infos_.resize(slot_num); + for (int i = 0; i < slot_infos_.size(); i++) { + slot_infos_[i].embedding_offset = EMPTY_OFFSET_VALUE; + } + + if (!need_record_freq) { + freq_desc_.reset(new NonFreqDescriptor()); + } + if (!need_record_version) { + version_desc_.reset(new NonVersionDescriptor()); + } + } + + 
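+  // Copy constructor: shares the slot layout and clones the freq/version descriptors.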
FeatureDescriptorImpl(FeatureDescriptorImpl* feat_desc_impl) { + slot_infos_ = feat_desc_impl->slot_infos_; + freq_desc_.reset(feat_desc_impl->freq_desc_->Clone()); + version_desc_.reset(feat_desc_impl->version_desc_->Clone()); + } + + virtual ~FeatureDescriptorImpl() {} + + virtual bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) = 0; + virtual bool InitSlotInfo(FeatureDescriptorImpl* feat_desc_impl) { + LOG(FATAL) << "InitSlotInfo(feat_desc_impl) is not implemented."; + } + virtual V* GetEmbedding(void* val, int emb_index) = 0; + virtual void* Allocate() = 0; + virtual void* Allocate(int64 freq) { return Allocate(); } + virtual void Deallocate(void* val) = 0; + virtual void Deallocate(const std::vector& val) = 0; + virtual void SetAllocator(Allocator* alloc) = 0; + virtual void SetDefaultValue(void* val, int64 key) = 0; + virtual void SetValue(void* val, int64 emb_index, V* value) {} + virtual bool IsAdmit(void* val) { return true; } + virtual void* Admit(void* val) {} +#if GOOGLE_CUDA + template + void SetDefaultValues(const K* keys, const std::list& init_cursor, + void** value_ptrs, se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) {} +#endif + virtual int data_bytes() = 0; + + virtual int64 GetFreq(void* val) { return freq_desc_->GetFreq(val); } + + virtual int64 GetVersion(void* val) { return version_desc_->GetVersion(val); } + + virtual void SetFreq(void* val, int64 freq) { + freq_desc_->SetFreq(val, freq); + } + + virtual void UpdateVersion(void* val, int64 version) { + version_desc_->UpdateVersion(val, version); + } + + virtual void AddFreq(void* val, int64 freq) { + freq_desc_->AddFreq(val, freq); + } + + inline int total_dim() { + int64 slot_num = slot_infos_.size(); + return slot_infos_[slot_num - 1].embedding_offset + + slot_infos_[slot_num - 1].embedding_dim; + } + + protected: + bool SetEmbeddingInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) { + slot_infos_[emb_index].default_value = default_value.first; + slot_infos_[emb_index].default_value_dim = default_value.second; + slot_infos_[emb_index].default_value_len = embedding_dim; + + bool is_aligned = true; + TF_CHECK_OK(ReadBoolFromEnvVar("EV_DATA_ALIGNED", true, &is_aligned)); + if (is_aligned) { + embedding_dim = ComputeAlignedDim(embedding_dim); + } + + // Avoid parallel consitency issue + __sync_bool_compare_and_swap(&slot_infos_[emb_index].embedding_offset, + EMPTY_OFFSET_VALUE, embedding_dim); + slot_infos_[emb_index].embedding_dim = embedding_dim; + // Check whether all offsets are set + for (int i = 0; i < slot_infos_.size(); i++) { + if (slot_infos_[i].embedding_offset == EMPTY_OFFSET_VALUE) { + return false; + } + } + + ComputeEmbeddingOffsets(); + return true; + } + + void SetSlotInfo(FeatureDescriptorImpl* feat_desc_impl) { + slot_infos_ = feat_desc_impl->slot_infos_; + } + + void ComputeAllocBytes(int* alloc_bytes) { + for (auto slot_info : slot_infos_) { + *alloc_bytes += slot_info.embedding_dim * sizeof(V); + } + } + + void CreateFreqAndVersionDescriptor(int* alloc_bytes) { + if (!freq_desc_) { + freq_desc_.reset(new FreqDescriptor(*alloc_bytes)); + *alloc_bytes += sizeof(int64); + } + if (!version_desc_) { + version_desc_.reset(new VersionDescriptor(*alloc_bytes)); + *alloc_bytes += sizeof(int64); + } + } + + void InitFreqAndVersion(void* val) { + freq_desc_->SetFreq(val, 0); + version_desc_->UpdateVersion(val, -1); + } + + void SetFreqAndVersionOffset(int* alloc_bytes) { + 
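+    // Each active descriptor appends an int64 field to the record layout and
+    // advances alloc_bytes; the Non* descriptors leave alloc_bytes unchanged.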
freq_desc_->SetOffset(alloc_bytes); + version_desc_->SetOffset(alloc_bytes); + } + + V* GetDefaultValuePtr(int64 emb_index, int64 key) { + V* default_value_base = (V*)slot_infos_[emb_index].default_value; + int64 default_value_offset = + (std::abs(key) % slot_infos_[emb_index].default_value_dim) * + slot_infos_[emb_index].default_value_len; + return default_value_base + default_value_offset; + } + + void SetDefaultValue(void* val, int64 emb_index, int64 key) { + memcpy(val, GetDefaultValuePtr(emb_index, key), + slot_infos_[emb_index].default_value_len * sizeof(V)); + } + + private: + int64 ComputeAlignedDim(int64 embedding_dim) { + int padding_bytes = ALIGN_BYTES - embedding_dim * sizeof(V) % ALIGN_BYTES; + if (padding_bytes == ALIGN_BYTES) { + return embedding_dim; + } else { + return embedding_dim + padding_bytes / sizeof(V); + } + } + + void ComputeEmbeddingOffsets() { + for (int i = slot_infos_.size() - 1; i >= 0; i--) { + slot_infos_[i].embedding_offset = 0; + for (int j = 0; j < i; j++) { + slot_infos_[i].embedding_offset += slot_infos_[j].embedding_offset; + } + } + } + + protected: + const int EMPTY_OFFSET_VALUE = -1; + const int ALIGN_BYTES = 16; + std::vector slot_infos_; + std::unique_ptr freq_desc_; + std::unique_ptr version_desc_; +}; + +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FEATURE_DESCRIPTOR_IMPL_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/filter_factory.h b/deepray/custom_ops/embedding_variable/cc/embedding/filter_factory.h new file mode 100644 index 00000000..db2e1a88 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/filter_factory.h @@ -0,0 +1,51 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FILTER_FACTORY_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FILTER_FACTORY_H_ + +#include "bloom_filter_policy.h" +#include "counter_filter_policy.h" +#include "embedding_config.h" +#include "filter_policy.h" +#include "nullable_filter_policy.h" + +namespace tensorflow { +namespace embedding { +template +class Storage; +} + +class FilterFactory { + public: + template + static FilterPolicy* CreateFilter( + const EmbeddingConfig& config, EV* ev, embedding::Storage* storage, + embedding::FeatureDescriptor* feat_desc) { + if (config.filter_freq > 0) { + if (config.kHashFunc != 0) { + return new BloomFilterPolicy(config, ev, feat_desc); + } else { + return new CounterFilterPolicy(config, ev, feat_desc); + } + } else { + return new NullableFilterPolicy(config, ev, storage, feat_desc); + } + } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FILTER_FACTORY_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/filter_policy.h b/deepray/custom_ops/embedding_variable/cc/embedding/filter_policy.h new file mode 100644 index 00000000..090f6f02 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/filter_policy.h @@ -0,0 +1,106 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FILTER_POLICY_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FILTER_POLICY_H_ + +#include "emb_file.h" +#include "embedding_config.h" +#include "feature_descriptor.h" + +namespace tensorflow { + +struct RestoreBuffer { + char* key_buffer = nullptr; + char* value_buffer = nullptr; + char* version_buffer = nullptr; + char* freq_buffer = nullptr; + bool should_release = false; + + explicit RestoreBuffer(size_t buffer_size) { + key_buffer = new char[buffer_size]; + value_buffer = new char[buffer_size]; + version_buffer = new char[buffer_size]; + freq_buffer = new char[buffer_size]; + should_release = true; + } + + explicit RestoreBuffer(char* i_key_buffer, char* i_value_buffer, + char* i_version_buffer, char* i_freq_buffer) { + key_buffer = i_key_buffer; + value_buffer = i_value_buffer; + version_buffer = i_version_buffer; + freq_buffer = i_freq_buffer; + } + + ~RestoreBuffer() { + if (should_release) { + delete[] key_buffer; + delete[] value_buffer; + delete[] version_buffer; + delete[] freq_buffer; + } + } +}; + +template +class RestoreSSDBuffer; + +template +class FilterPolicy { + public: + FilterPolicy(const EmbeddingConfig& config, EV* ev) + : config_(config), ev_(ev) {} + + virtual void LookupOrCreate(K key, V* val, const V* default_value_ptr, + void** value_ptr, int count, + const V* default_value_no_permission) = 0; + + virtual Status Lookup(K key, V* val, const V* default_value_ptr, + const V* default_value_no_permission) = 0; + +#if GOOGLE_CUDA + virtual void BatchLookup(const EmbeddingVarContext& context, + const K* keys, V* output, int64 num_of_keys, + V* default_value_ptr, + V* default_value_no_permission) = 0; + + virtual void BatchLookupOrCreateKey(const EmbeddingVarContext& ctx, + const K* keys, void** value_ptrs_list, + int64 num_of_keys) = 0; +#endif // GOOGLE_CUDA + + virtual Status LookupOrCreateKey(K key, void** val, bool* is_filter, + int64 count) = 0; + + virtual Status LookupKey(K key, void** val, bool* is_filter, int64 count) {} + + virtual int64 GetFreq(K key, void* value_ptr) = 0; + virtual int64 GetFreq(K key) = 0; + + virtual bool is_admit(K key, void* value_ptr) = 0; + + virtual Status Restore(int64 key_num, int bucket_num, int64 partition_id, + int64 partition_num, int64 value_len, bool is_filter, + bool to_dram, bool is_incr, + RestoreBuffer& restore_buff) = 0; + + protected: + EmbeddingConfig config_; + EV* ev_; +}; +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_FILTER_POLICY_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/globalstep_shrink_policy.h b/deepray/custom_ops/embedding_variable/cc/embedding/globalstep_shrink_policy.h new file mode 100644 index 00000000..ed7a7be9 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/globalstep_shrink_policy.h @@ -0,0 +1,62 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_GLOBALSTEP_SHRINK_POLICY_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_GLOBALSTEP_SHRINK_POLICY_H_ + +#include "shrink_policy.h" + +namespace tensorflow { +namespace embedding { +template +class GlobalStepShrinkPolicy : public ShrinkPolicy { + public: + GlobalStepShrinkPolicy(int64 steps_to_live, FeatureDescriptor* feat_desc, + KVInterface* kv) + : steps_to_live_(steps_to_live), kv_(kv), ShrinkPolicy(feat_desc) {} + + TF_DISALLOW_COPY_AND_ASSIGN(GlobalStepShrinkPolicy); + + void Shrink(std::vector& key_list, std::vector& value_list, + const ShrinkArgs& shrink_args) override { + ShrinkPolicy::ReleaseValuePtrs(); + FilterToDelete(shrink_args.global_step, key_list, value_list); + } + + private: + void FilterToDelete(int64 global_step, std::vector& key_list, + std::vector& value_list) { + for (int64 i = 0; i < key_list.size(); ++i) { + int64 version = ShrinkPolicy::feat_desc_->GetVersion(value_list[i]); + if (version == -1) { + ShrinkPolicy::feat_desc_->UpdateVersion(value_list[i], + global_step); + } else { + if (global_step - version > steps_to_live_) { + kv_->Remove(key_list[i]); + ShrinkPolicy::EmplacePointer(value_list[i]); + value_list[i] = (void*)ValuePtrStatus::IS_DELETED; + } + } + } + } + + private: + int64 steps_to_live_; + KVInterface* kv_; +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_GLOBALSTEP_SHRINK_POLICY_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/gpu_hash_map_kv.h b/deepray/custom_ops/embedding_variable/cc/embedding/gpu_hash_map_kv.h new file mode 100644 index 00000000..5d7ba3d0 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/gpu_hash_map_kv.h @@ -0,0 +1,333 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_GPU_HASH_MAP_KV_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_GPU_HASH_MAP_KV_H_ + +#if GOOGLE_CUDA + +#include "gpu_hash_table.h" +#include "kv_interface.h" +#include "tensorflow/core/util/env_var.h" + +namespace tensorflow { + +namespace embedding { + +template +class GPUHashMapKV : public KVInterface { + public: + GPUHashMapKV(const EmbeddingConfig& config, Allocator* alloc) + : config_(config), alloc_(alloc), static_hash_table_(nullptr) { + TF_CHECK_OK(ReadBoolFromEnvVar(kInferenceMode, false, &is_inference_)); + if (!is_inference_) { + hash_table_ = new GPUHashTable(-1, alloc); + } + } + + ~GPUHashMapKV() override { + if (is_inference_) { + TypedAllocator::Deallocate( + alloc_, static_hash_table_->values_d, + static_hash_table_->capacity_ * static_hash_table_->dimension_); + delete static_hash_table_; + } else { + for (int i = 0; i < hash_table_->bank_ptrs.size(); ++i) { + TypedAllocator::Deallocate(alloc_, hash_table_->bank_ptrs[i], + value_len_ * hash_table_->initial_bank_size); + TypedAllocator::Deallocate(alloc_, hash_table_->existence_flag_ptrs[i], + hash_table_->initial_bank_size); + } + if (hash_table_->mem_bank_num != 0) { + auto num_elements = hash_table_->mem_bank_num * + (config_.block_num * (1 + config_.slot_num)); + TypedAllocator::Deallocate(alloc_, hash_table_->d_bank_ptrs, + num_elements); + TypedAllocator::Deallocate(alloc_, hash_table_->d_existence_flag_ptrs, + num_elements); + } + delete hash_table_; + } + } + + TF_DISALLOW_COPY_AND_ASSIGN(GPUHashMapKV); + + void SetValueLen(int64 value_len) { value_len_ = value_len; } + + Status BatchLookupOrCreateKeys(const K* keys, size_t n, int32* item_idxs, + const Eigen::GpuDevice& device) { + if (n > 0) { + mutex_lock lock(lock_); + int remaining_size = + n + *(hash_table_->start_idx) - + hash_table_->mem_bank_num * hash_table_->initial_bank_size; + if (remaining_size > 0) { + Resize(remaining_size); + } + functor::KvLookupInsertKey()( + keys, item_idxs, n, hash_table_, hash_table_->start_idx, + device.stream()); + } + return OkStatus(); + } + + Status BatchLookupOrCreate(const K* keys, V* val, V* default_v, + int32 default_v_num, size_t n, + const Eigen::GpuDevice& device) { + if (n > 0) { + int32* item_idxs = + TypedAllocator::Allocate(alloc_, n, AllocationAttributes()); + BatchLookupOrCreateKeys(keys, n, item_idxs, device); + functor::KvLookupCreateEmb()( + keys, val, default_v, value_len_, item_idxs, n, config_.emb_index, + default_v_num, hash_table_->d_bank_ptrs, + hash_table_->d_existence_flag_ptrs, + (config_.block_num * (1 + config_.slot_num)), + hash_table_->initial_bank_size, device.stream()); + TypedAllocator::Deallocate(alloc_, item_idxs, n); + } + + return OkStatus(); + } + + void GetSnapshot(std::vector* key_list, std::vector* value_list, + const EmbeddingConfig& emb_config) { + if (is_inference_) return; // Special case for testing in training mode; + auto size = hash_table_->Size(); + if (size <= 0) return; + + int32* item_idxs = + TypedAllocator::Allocate(alloc_, size, AllocationAttributes()); + K* keys_gpu = + TypedAllocator::Allocate(alloc_, size, AllocationAttributes()); + V* values_gpu = TypedAllocator::Allocate(alloc_, size * value_len_, + AllocationAttributes()); + V* values = TypedAllocator::Allocate(cpu_allocator(), size * value_len_, + AllocationAttributes()); + key_list->resize(size); + for (int64 i = 0; i < size; i++) { + value_list->emplace_back(values + i * value_len_); + } + + 
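+    // Launch the snapshot kernels on the GPU, then copy the keys and embedding
+    // values back into the host-side buffers before releasing the scratch memory.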
auto slot_num = emb_config.block_num * (1 + emb_config.slot_num); + functor::KvKeyGetSnapshot()( + keys_gpu, item_idxs, emb_config.emb_index, emb_config.primary_emb_index, + hash_table_->d_existence_flag_ptrs, hash_table_->mem_bank_num, slot_num, + hash_table_->initial_bank_size, hash_table_, size, NULL); + + functor::KvEmbGetSnapshot()( + keys_gpu, values_gpu, -1, value_len_, item_idxs, size, + emb_config.emb_index, hash_table_->d_bank_ptrs, + hash_table_->mem_bank_num, slot_num, hash_table_->initial_bank_size, + NULL); + + cudaMemcpyAsync(const_cast(key_list->data()), keys_gpu, + size * sizeof(K), cudaMemcpyDeviceToHost); + cudaMemcpyAsync(values, values_gpu, size * value_len_ * sizeof(V), + cudaMemcpyDeviceToHost); + EventSynchronize(NULL); + TypedAllocator::Deallocate(alloc_, item_idxs, size); + TypedAllocator::Deallocate(alloc_, keys_gpu, size); + TypedAllocator::Deallocate(alloc_, values_gpu, size * value_len_); + } + + Status Import(const std::vector& key_import, + const std::vector& value_import, + const Eigen::GpuDevice* device, + const EmbeddingConfig& emb_config) { + int n = key_import.size(); + auto stream = device->stream(); + + if (is_inference_) { + if (n == 0) { + LOG(INFO) << "Size of keys in EmbeddingVar: " << emb_config.name + << " is 0 while loading in inference mode!"; + return OkStatus(); + } + static_hash_table_ = + new GPUStaticHashTable(n, value_len_, -1, -1, alloc_, stream); + K* keys_d = + TypedAllocator::Allocate(alloc_, n, AllocationAttributes()); + cudaMemcpyAsync(keys_d, key_import.data(), n * sizeof(K), + cudaMemcpyHostToDevice, stream); + static_hash_table_->values_d = TypedAllocator::Allocate( + alloc_, value_import.size(), AllocationAttributes()); + cudaMemcpyAsync(static_hash_table_->values_d, value_import.data(), + value_import.size() * sizeof(V), cudaMemcpyHostToDevice, + stream); + functor::KvInitStaticMap()( + keys_d, static_hash_table_, n, value_len_, stream); + EventSynchronize(stream); + + TypedAllocator::Deallocate(alloc_, keys_d, n); + } else { + if (n > 0) { + int32* item_idxs = + TypedAllocator::Allocate(alloc_, n, AllocationAttributes()); + K* key_gpu = + TypedAllocator::Allocate(alloc_, n, AllocationAttributes()); + cudaMemcpyAsync(key_gpu, key_import.data(), + key_import.size() * sizeof(K), cudaMemcpyHostToDevice, + stream); + BatchLookupOrCreateKeys(key_gpu, n, item_idxs, *device); + V* value_gpu = TypedAllocator::Allocate(alloc_, value_import.size(), + AllocationAttributes()); + cudaMemcpyAsync(value_gpu, value_import.data(), + value_import.size() * sizeof(V), cudaMemcpyHostToDevice, + stream); + + functor::KvUpdateEmb()( + key_import.data(), value_gpu, value_len_, item_idxs, n, + emb_config.emb_index, key_import.size(), hash_table_->d_bank_ptrs, + hash_table_->d_existence_flag_ptrs, + (emb_config.block_num * (1 + emb_config.slot_num)), + hash_table_->initial_bank_size, stream); + EventSynchronize(stream); + TypedAllocator::Deallocate(alloc_, item_idxs, n); + TypedAllocator::Deallocate(alloc_, value_gpu, value_import.size()); + TypedAllocator::Deallocate(alloc_, key_gpu, n); + } + } + + return OkStatus(); + } + + Status BatchLookupOrCreate(const K* keys, size_t n, + void** value_ptrs) override { + return OkStatus(); + } + + Status Lookup(K key, void** value_ptr) override { return OkStatus(); } + + Status Contains(K key) override { return OkStatus(); } + + Status Insert(K key, const void* value_ptr) override { return OkStatus(); } + + Status Remove(K key) override { return OkStatus(); } + + Status BatchLookup(const K* keys, size_t size, 
void** value_ptrs) override { + return OkStatus(); + } + + Status BatchInsert(const std::vector& keys, + const std::vector& value_ptrs) override { + return OkStatus(); + } + + Status BatchRemove(const K* keys, size_t size) override { return OkStatus(); } + + Status BatchCommit(const std::vector& keys, + const std::vector& value_ptrs) override { + return OkStatus(); + } + + int64 Size() const override { return 0; } + + void FreeValuePtr(void* value_ptr) override {} + + Status Commit(K key, const void* value_ptr) override { return OkStatus(); } + + Status GetSnapshot(std::vector* key_list, + std::vector* value_ptr_list) override { + return OkStatus(); + } + + Status GetShardedSnapshot(std::vector>& key_list, + std::vector>& value_ptr_list, + int partition_id, int partition_nums) override { + LOG(INFO) << "GPUHashMapKV do not support GetShardedSnapshot"; + return OkStatus(); + } + + std::string DebugString() const override { return std::string(); } + + GPUHashTable* HashTable() override { return hash_table_; } + + Status BatchLookup(const Eigen::GpuDevice& device, const K* keys, V* val, + size_t n, const V* default_v) override { + if (n > 0) { + if (is_inference_) { + functor::KvLookupKey, K, V>()( + keys, val, n, value_len_, config_.emb_index, + (config_.block_num * (1 + config_.slot_num)), static_hash_table_, + default_v, config_.default_value_dim, device.stream()); + } else { + functor::KvLookupKey, K, V>()( + keys, val, n, value_len_, config_.emb_index, + (config_.block_num * (1 + config_.slot_num)), hash_table_, + default_v, config_.default_value_dim, device.stream()); + } + } + return OkStatus(); + } + + private: + void Resize(int hint) { + while (hint > 0) { + for (int i = 0; i < (config_.block_num * (1 + config_.slot_num)); ++i) { + V* ptr = TypedAllocator::Allocate( + alloc_, value_len_ * hash_table_->initial_bank_size, + AllocationAttributes()); + hash_table_->bank_ptrs.push_back(ptr); + bool* ptr2 = TypedAllocator::Allocate( + alloc_, hash_table_->initial_bank_size, AllocationAttributes()); + hash_table_->existence_flag_ptrs.push_back(ptr2); + cudaMemset(ptr2, 0, sizeof(bool) * hash_table_->initial_bank_size); + } + hint -= hash_table_->initial_bank_size; + ++hash_table_->mem_bank_num; + } + + auto num_elements = hash_table_->mem_bank_num * + (config_.block_num * (1 + config_.slot_num)); + if (hash_table_->d_bank_ptrs) { + TypedAllocator::Deallocate(alloc_, hash_table_->d_bank_ptrs, + num_elements); + TypedAllocator::Deallocate(alloc_, hash_table_->d_existence_flag_ptrs, + num_elements); + } + hash_table_->d_bank_ptrs = TypedAllocator::Allocate( + alloc_, num_elements, AllocationAttributes()); + cudaMemcpy(hash_table_->d_bank_ptrs, hash_table_->bank_ptrs.data(), + num_elements * sizeof(V*), cudaMemcpyHostToDevice); + hash_table_->d_existence_flag_ptrs = TypedAllocator::Allocate( + alloc_, num_elements, AllocationAttributes()); + cudaMemcpy(hash_table_->d_existence_flag_ptrs, + hash_table_->existence_flag_ptrs.data(), + num_elements * sizeof(bool*), cudaMemcpyHostToDevice); + } + + void EventSynchronize(const cudaStream_t& stream) { + cudaEvent_t is_finish; + cudaEventCreate(&is_finish); + cudaEventRecord(is_finish, stream); + cudaEventSynchronize(is_finish); + cudaEventDestroy(is_finish); + } + + private: + EmbeddingConfig config_; + bool is_inference_; + GPUStaticHashTable* static_hash_table_; + GPUHashTable* hash_table_; + Allocator* alloc_; + int64 value_len_; + mutex lock_; +}; + +} // namespace embedding +} // namespace tensorflow + +#endif // GOOGLE_CUDA +#endif // 
TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_GPU_HASH_MAP_KV_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/gpu_hash_table.cu.cc b/deepray/custom_ops/embedding_variable/cc/embedding/gpu_hash_table.cu.cc new file mode 100644 index 00000000..fa114f62 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/gpu_hash_table.cu.cc @@ -0,0 +1,708 @@ +/* + * Copyright (c) 2020, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "gpu_hash_table.h" + +#include +#include +#include + +#include "cuco/dynamic_map.cuh" +#include "cuco/static_map.cuh" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace cg = cooperative_groups; + +namespace tensorflow { +typedef Eigen::GpuDevice GPUDevice; + +namespace { +const size_t BLOCK_SIZE = 128; +const size_t STRIDE = 1; +const size_t TILE_SIZE = 4; +} // namespace +template +class gpu_hash_map_tf_allocator { + public: + Allocator* alloc_; + using value_type = T; + + gpu_hash_map_tf_allocator(Allocator* alloc) : alloc_(alloc) {} + + gpu_hash_map_tf_allocator(const gpu_hash_map_tf_allocator& a) noexcept + : alloc_(a.alloc_) {} + + template + gpu_hash_map_tf_allocator(const gpu_hash_map_tf_allocator& a) noexcept + : alloc_(a.alloc_) {} + + gpu_hash_map_tf_allocator& operator=( + const gpu_hash_map_tf_allocator& a) noexcept { + return *this; + } + + gpu_hash_map_tf_allocator& operator=(gpu_hash_map_tf_allocator&& a) { + alloc_ = a.alloc_; + return *this; + } + + ~gpu_hash_map_tf_allocator() noexcept {} + + value_type* allocate(size_t size) const { + void* ptr = + alloc_->AllocateRaw(Allocator::kAllocatorAlignment, + size * sizeof(value_type), AllocationAttributes()); + return (value_type*)ptr; + } + + void deallocate(value_type* ptr, size_t) const { alloc_->DeallocateRaw(ptr); } +}; + +template +bool operator==(gpu_hash_map_tf_allocator const&, + gpu_hash_map_tf_allocator const&) noexcept { + return true; +} + +template +bool operator!=(gpu_hash_map_tf_allocator const& lhs, + gpu_hash_map_tf_allocator const& rhs) noexcept { + return not(lhs == rhs); +} + +template > +class DynamicHashTable { + public: + cuco::dynamic_map + map_; + + DynamicHashTable(size_t initial_capacity, KeyType empty_key_sentinel, + ValueType empty_value_sentinel, CUCOAllocator alloc) + : map_(initial_capacity, empty_key_sentinel, empty_value_sentinel, + alloc) {} + ~DynamicHashTable() {} +}; + +template +GPUHashTable::GPUHashTable(K empty_key_sentinel, Allocator* alloc, + size_t initial_capacity) + : initial_bank_size(initial_capacity) { + hash_table = + new DynamicHashTable(initial_capacity, empty_key_sentinel, -1, + gpu_hash_map_tf_allocator(alloc)); + cudaMallocManaged( + &start_idx, sizeof(cuda::atomic)); + *start_idx = 0; +} 
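+// Note: start_idx above is allocated with cudaMallocManaged so the host can read
+// the insertion cursor that device-side lookup/insert kernels advance.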
+ +template +GPUHashTable::~GPUHashTable() { + delete hash_table; + cudaFree(start_idx); +} + +template +int32 GPUHashTable::Size() { + return hash_table->map_.get_size(); +} + +template > +class StaticHashTable { + public: + cuco::static_map map_; + + StaticHashTable(size_t initial_capacity, K empty_key_sentinel, + int32 empty_value_sentinel, CUCOAllocator alloc) + : map_(initial_capacity, empty_key_sentinel, empty_value_sentinel, + alloc) {} +}; + +template +GPUStaticHashTable::GPUStaticHashTable(size_t capacity, int dimension, + K empty_key_sentinel, + int32 empty_value_sentinel, + Allocator* alloc, + cudaStream_t stream) { + capacity_ = capacity; + dimension_ = dimension; + // cudaMallocAsync(&values_d, sizeof(V) * dimension * capacity, stream); + // cudaMallocManaged(&values_d, sizeof(V) * dimension * capacity); + + hash_table = new StaticHashTable( + capacity / 0.8 /*load_factor*/, empty_key_sentinel, empty_value_sentinel, + gpu_hash_map_tf_allocator(alloc)); +} + +template +GPUStaticHashTable::~GPUStaticHashTable() { + delete hash_table; + delete default_values; + cudaFree(values_d); +} + +template +std::size_t GPUStaticHashTable::Size() { + return hash_table->map_.get_size(); +} + +#define REGISTER_ALL_TYPE(type) \ + template class GPUHashTable; \ + template class GPUHashTable; \ + template class GPUStaticHashTable; \ + template class GPUStaticHashTable; +TF_CALL_REAL_NUMBER_TYPES(REGISTER_ALL_TYPE) +#undef REGISTER_ALL_TYPE + +namespace functor { +using atomicT = cuda::atomic; + +template , + typename KeyEqual = thrust::equal_to> +__global__ void kv_initialize_static_map(const Key* key_first, int32 num_items, + int32 dimension, + mutableViewT map_mutable_view, + atomicT* num_successes, + Hash hash = Hash{}, + KeyEqual key_equal = KeyEqual{}) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + std::size_t thread_num_successes = 0; + + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto key_idx = tid / tile_size; + + while (key_idx < num_items) { + auto key = *(key_first + key_idx); + int32 value = key_idx * dimension; + + auto const insert_pair = cuco::pair_type{key, value}; + if (map_mutable_view.insert(tile, insert_pair, hash, key_equal) && + tile.thread_rank() == 0) { + thread_num_successes++; + } + + key_idx += (gridDim.x * blockDim.x) / tile_size; + } + std::size_t block_num_successes = + BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { + *num_successes += block_num_successes; + } +} + +template +struct KvInitStaticMap { + void operator()(const Key* keys, GPUStaticHashTable* hash_table, + int32 num_items, int32 dimension, cudaStream_t stream) { + using MutableViewT = typename cuco::static_map< + Key, int32, cuda::thread_scope_device, + gpu_hash_map_tf_allocator>::device_mutable_view; + + auto& map = hash_table->hash_table->map_; + size_t num_to_insert = num_items; + while (num_to_insert > 0) { + static_assert(sizeof(std::size_t) == sizeof(atomicT)); + CUCO_CUDA_TRY( + cudaMemsetAsync(map.get_num_success(), 0, sizeof(atomicT), stream)); + + auto n = std::min((size_t)65535, num_to_insert); + auto const grid_size = + (TILE_SIZE * n + STRIDE * BLOCK_SIZE - 1) / (STRIDE * BLOCK_SIZE); + TF_CHECK_OK(GpuLaunchKernel( + kv_initialize_static_map, + thrust::equal_to>, + grid_size, BLOCK_SIZE, 0, stream, keys, n, dimension, + map.get_device_mutable_view(), map.get_num_success(), + cuco::detail::MurmurHash3_32{}, thrust::equal_to{})); + + 
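+      // Wait for the insert kernel to finish so the num_success counter it
+      // accumulated on the device is valid before it is read on the host and
+      // folded into the static map's size.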
CUCO_CUDA_TRY(cudaStreamSynchronize(stream)); + + std::size_t h_num_successes = + map.get_num_success()->load(cuda::std::memory_order_relaxed); + map.update_size(h_num_successes); + keys += n; + num_to_insert -= n; + } + } +}; + +template , + typename KeyEqual = thrust::equal_to> +__global__ void kv_lookup_dynamic_key_kernel( + const Key* key_first, V** value_srcs, V* value_first, const V* default_v, + int32 default_v_num, size_t num_items, int32 dimension, ViewT* submap_views, + uint32_t num_submaps, int32 slot_idx, int32 slot_num, int32 bank_size, + Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) { + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto key_idx = tid / tile_size; + auto empty_value_sentinel = submap_views[0].get_empty_value_sentinel(); + + while (key_idx < num_items) { + auto key = *(key_first + key_idx); + int32 found_value = empty_value_sentinel; + + for (auto i = 0; i < num_submaps; ++i) { + auto submap_view = submap_views[i]; + auto found = submap_view.find(tile, key, hash, key_equal); + if (found != submap_view.end()) { + found_value = found->second; + break; + } + } + if (found_value == empty_value_sentinel) { + for (int id = tile.thread_rank(); id < dimension; id += tile_size) { + value_first[key_idx * dimension + id] = + default_v[abs(key) % default_v_num * dimension + id]; + } + } else { + auto bank_idx = found_value / bank_size; + auto offset_in_bank = found_value % bank_size; + auto slot_offset = bank_idx * slot_num + slot_idx; + for (int id = tile.thread_rank(); id < dimension; id += tile_size) { + value_first[key_idx * dimension + id] = + value_srcs[slot_offset][offset_in_bank * dimension + id]; + } + } + key_idx += (gridDim.x * blockDim.x) / tile_size; + } +} + +template +struct KvLookupKey, Key, V> { + void operator()(const Key* keys, V* vals, int32 num_items, int32 dimension, + int32 slot_idx, int32 slot_num, + GPUHashTable* hash_table, const V* default_v, + int32 default_v_num, cudaStream_t stream) { + using mutableViewT = typename cuco::dynamic_map< + Key, int32, cuda::thread_scope_device, + gpu_hash_map_tf_allocator>::mutable_view_type; + using ViewT = typename cuco::dynamic_map< + Key, int32, cuda::thread_scope_device, + gpu_hash_map_tf_allocator>::view_type; + + auto& map = hash_table->hash_table->map_; + + auto const grid_size = (TILE_SIZE * num_items + STRIDE * BLOCK_SIZE - 1) / + (STRIDE * BLOCK_SIZE); + TF_CHECK_OK(GpuLaunchKernel( + kv_lookup_dynamic_key_kernel, + grid_size, BLOCK_SIZE, 0, stream, keys, hash_table->d_bank_ptrs, vals, + default_v, default_v_num, num_items, dimension, + map.get_submap_views().data().get(), map.get_submaps().size(), slot_idx, + slot_num, hash_table->initial_bank_size, + cuco::detail::MurmurHash3_32{}, thrust::equal_to{})); + } +}; + +template , + typename KeyEqual = thrust::equal_to> +__global__ void kv_lookup_static_key_kernel( + const Key* key_first, const V* value_srcs, V* value_first, + const V* default_v, int32 default_v_num, size_t num_items, int32 dimension, + ViewT map_views, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) { + auto grid = cooperative_groups::this_grid(); + auto block = cooperative_groups::this_thread_block(); + auto tile = cooperative_groups::tiled_partition(block); + + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto key_idx = tid / tile_size; // actual thread idx + auto empty_value_sentinel = map_views.get_empty_value_sentinel(); + + while (key_idx < num_items) { + auto key = *(key_first + key_idx); + int32 
found_value = empty_value_sentinel; + auto found = map_views.find(tile, key, hash, key_equal); + if (found != map_views.end()) { + found_value = found->second; + } + + if (found_value == empty_value_sentinel) { + for (int id = tile.thread_rank(); id < dimension; id += tile_size) { + value_first[key_idx * dimension + id] = + default_v[abs(key) % default_v_num * dimension + id]; + } + } else { + for (int id = tile.thread_rank(); id < dimension; id += tile_size) { + value_first[key_idx * dimension + id] = value_srcs[found_value + id]; + } + } + key_idx += (gridDim.x * blockDim.x) / tile_size; + } +} + +template +struct KvLookupKey, Key, V> { + void operator()(const Key* keys, V* vals, int32 num_items, int32 dimension, + int32 slot_idx, int32 slot_num, + GPUStaticHashTable* hash_table, const V* default_v, + int32 default_v_num, cudaStream_t stream) { + using ViewT = typename cuco::static_map< + Key, int32, cuda::thread_scope_device, + gpu_hash_map_tf_allocator>::device_view; + auto& map = hash_table->hash_table->map_; + + auto const grid_size = (TILE_SIZE * num_items + STRIDE * BLOCK_SIZE - 1) / + (STRIDE * BLOCK_SIZE); + TF_CHECK_OK(GpuLaunchKernel( + kv_lookup_static_key_kernel, + grid_size, BLOCK_SIZE, 0, stream, keys, hash_table->values_d, vals, + default_v, default_v_num, num_items, dimension, map.get_device_view(), + cuco::detail::MurmurHash3_32{}, thrust::equal_to{})); + } +}; + +template , + typename KeyEqual = thrust::equal_to> +__global__ void kv_lookup_and_insert_key_kernel( + const Key* key_first, int32* value_first, int32 num_items, + mutableViewT* submap_mutable_views, ViewT* submap_views, + uint32_t num_submaps, atomicT* num_successes, atomicT* start_idx, + int32 submap_idx, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + std::size_t thread_num_successes = 0; + + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto key_idx = tid / tile_size; + auto empty_value_sentinel = submap_views[0].get_empty_value_sentinel(); + int32 tmp; + + while (key_idx < num_items) { + auto key = *(key_first + key_idx); + int32 found_value = empty_value_sentinel; + + for (auto i = 0; i < num_submaps; ++i) { + auto submap_view = submap_views[i]; + auto found = submap_view.find(tile, key, hash, key_equal); + if (found != submap_view.end()) { + found_value = found->second; + break; + } + } + if (found_value == empty_value_sentinel) { + if (tile.thread_rank() == 0) { + tmp = start_idx->fetch_add(1); + } + found_value = tile.shfl(tmp, 0); + auto insert_pair = cuco::pair_type{key, found_value}; + if (submap_mutable_views[submap_idx].insert(tile, insert_pair, hash, + key_equal) && + tile.thread_rank() == 0) { + thread_num_successes++; + } + } + + if (tile.thread_rank() == 0) { + *(value_first + key_idx) = found_value; + } + key_idx += (gridDim.x * blockDim.x) / tile_size; + } + + std::size_t block_num_successes = + BlockReduce(temp_storage).Sum(thread_num_successes); + if (threadIdx.x == 0) { + *num_successes += block_num_successes; + } +} + +template +struct KvLookupInsertKey { + void operator()(const Key* key_first, int32* value_first, int32 num_items, + GPUHashTable* hash_table, atomicT* start_idx, + cudaStream_t stream) { + using mutableViewT = typename cuco::dynamic_map< + Key, int32, cuda::thread_scope_device, + gpu_hash_map_tf_allocator>::mutable_view_type; + using ViewT = typename cuco::dynamic_map< + Key, int32, 
cuda::thread_scope_device, + gpu_hash_map_tf_allocator>::view_type; + auto& map = hash_table->hash_table->map_; + map.reserve(map.get_size() + num_items); + uint32_t submap_idx = 0; + std::size_t num_to_insert = num_items; + + while (num_to_insert > 0) { + std::size_t capacity_remaining = + map.get_max_load_factor() * + map.get_submaps()[submap_idx]->get_capacity() - + map.get_submaps()[submap_idx]->get_size(); + if (capacity_remaining >= map.get_min_insert_size()) { + *(map.get_num_successes()) = 0; + int device_id; + CUCO_CUDA_TRY(cudaGetDevice(&device_id)); + CUCO_CUDA_TRY(cudaMemPrefetchAsync(map.get_num_successes(), + sizeof(atomicT), device_id)); + + auto n = std::min(capacity_remaining, num_to_insert); + + auto const grid_size = + (TILE_SIZE * n + STRIDE * BLOCK_SIZE - 1) / (STRIDE * BLOCK_SIZE); + TF_CHECK_OK(GpuLaunchKernel( + kv_lookup_and_insert_key_kernel< + BLOCK_SIZE, TILE_SIZE, Key, mutableViewT, ViewT, + cuco::detail::MurmurHash3_32, thrust::equal_to>, + grid_size, BLOCK_SIZE, 0, stream, key_first, value_first, n, + map.get_submap_mutable_views().data().get(), + map.get_submap_views().data().get(), map.get_submaps().size(), + map.get_num_successes(), start_idx, submap_idx, + cuco::detail::MurmurHash3_32{}, thrust::equal_to{})); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); + std::size_t h_num_successes = + map.get_num_successes()->load(cuda::std::memory_order_relaxed); + map.update_submap_sizes(submap_idx, h_num_successes); + key_first += n; + value_first += n; + num_to_insert -= n; + } + submap_idx++; + } + } +}; + +template +__global__ void kv_lookup_or_create_emb_kernel( + const Key* key_first, Value* val, Value* default_v, int64 dim, + int32* item_idxs, int32 slot_idx, Value** d_banks, bool** d_flags, + int32 slot_num, int32 default_v_num, int32 bank_size) { + auto item_idx = blockIdx.x; + auto item_pos = item_idxs[item_idx]; + auto bank_idx = item_pos / bank_size; + auto offset_in_bank = item_pos % bank_size; + auto slot_offset = bank_idx * slot_num + slot_idx; + bool stored = d_flags[slot_offset][offset_in_bank]; + __syncthreads(); + if (stored == false) { + d_flags[slot_offset][offset_in_bank] = true; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + int32 default_v_idx = *(key_first + item_idx) % default_v_num; + d_banks[slot_offset][offset_in_bank * dim + id] = + default_v[default_v_idx * dim + id]; + } + } + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + val[item_idx * dim + id] = d_banks[slot_offset][offset_in_bank * dim + id]; + } +} + +template +struct KvLookupCreateEmb { + void operator()(const Key* key_first, Value* val, Value* default_v, int64 dim, + int32* item_idxs, int32 num_items, int32 slot_idx, + int32 default_v_num, Value** d_banks, bool** d_flags, + int32 slot_num, int32 bank_size, cudaStream_t stream) { + auto const block_size = 256; + auto const grid_size = num_items; + TF_CHECK_OK(GpuLaunchKernel( + kv_lookup_or_create_emb_kernel, grid_size, block_size, 0, + stream, key_first, val, default_v, dim, item_idxs, slot_idx, d_banks, + d_flags, slot_num, default_v_num, bank_size)); + } +}; + +template +__global__ void kv_update_emb_kernel(const Key* key_first, Value* default_v, + int64 dim, int32* item_idxs, + int32 slot_idx, Value** d_banks, + bool** d_flags, int32 slot_num, + int32 default_v_num, int32 bank_size) { + auto item_idx = blockIdx.x; + auto item_pos = item_idxs[item_idx]; + auto bank_idx = item_pos / bank_size; + auto offset_in_bank = item_pos % bank_size; + auto slot_offset = bank_idx * slot_num + slot_idx; + bool 
stored = d_flags[slot_offset][offset_in_bank]; + __syncthreads(); + if (stored == false) { + d_flags[slot_offset][offset_in_bank] = true; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + int32 default_v_idx; + default_v_idx = item_idx % default_v_num; + d_banks[slot_offset][offset_in_bank * dim + id] = + default_v[default_v_idx * dim + id]; + } + } +} + +template +struct KvUpdateEmb { + void operator()(const Key* key_first, Value* default_v, int64 dim, + int32* item_idxs, int32 num_items, int32 slot_idx, + int32 default_v_num, Value** d_banks, bool** d_flags, + int32 slot_num, int32 bank_size, cudaStream_t stream) { + auto const block_size = 256; + auto const grid_size = num_items; + TF_CHECK_OK(GpuLaunchKernel(kv_update_emb_kernel, grid_size, + block_size, 0, stream, key_first, default_v, + dim, item_idxs, slot_idx, d_banks, d_flags, + slot_num, default_v_num, bank_size)); + } +}; + +template , + typename KeyEqual = thrust::equal_to> +__global__ void kv_get_key_snapshot_kernel( + Key* key, int32* item_idxs, int32 slot_idx, int32 primary_slot_idx, + bool** d_flags, int32 bank_num, int32 slot_num, int32 bank_size, + ViewT* submap_views, uint32_t num_submaps, int32 ev_size, + Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) { + int n = 0; + for (auto i = 0; i < num_submaps; ++i) { + auto submap_view_size = submap_views[i].get_capacity(); + for (auto j = 0; j < submap_view_size; ++j) { + auto found = submap_views[i].get_slot(j, hash, key_equal); + if (found != submap_views[i].end()) { + int32 item_pos = found->second; + auto bank_idx = item_pos / bank_size; + auto offset_in_bank = item_pos % bank_size; + auto slot_offset = bank_idx * slot_num + slot_idx; + auto pri_slot_offset = bank_idx * slot_num + primary_slot_idx; + if (d_flags[slot_offset][offset_in_bank] && + d_flags[pri_slot_offset][offset_in_bank]) { + *(key + n) = found->first; + *(item_idxs + n) = found->second; + ++n; + } + } + } + } + for (auto i = n; i < ev_size; ++i) { + *(key + n) = submap_views[0].get_empty_key_sentinel(); + } +} + +template +struct KvKeyGetSnapshot { + void operator()(Key* key_first, int32* value_first, int32 slot_idx, + int32 primary_slot_idx, bool** d_flags, int32 bank_num, + int32 slot_num, int32 bank_size, + GPUHashTable* hash_table, int32 ev_size, + cudaStream_t stream) { + using ViewT = typename cuco::dynamic_map< + Key, int32, cuda::thread_scope_device, + gpu_hash_map_tf_allocator>::view_type; + auto& map = hash_table->hash_table->map_; + + auto const block_size = 1; + auto const grid_size = 1; + TF_CHECK_OK(GpuLaunchKernel( + kv_get_key_snapshot_kernel, + thrust::equal_to>, + grid_size, block_size, 0, stream, key_first, value_first, slot_idx, + primary_slot_idx, d_flags, bank_num, slot_num, bank_size, + map.get_submap_views().data().get(), map.get_submaps().size(), ev_size, + cuco::detail::MurmurHash3_32{}, thrust::equal_to{})); + CUCO_CUDA_TRY(cudaDeviceSynchronize()); + } +}; + +template +__global__ void kv_emb_get_snapshot_kernel(Key* key, Value* val, + Key empty_key_sentinel, int64 dim, + int32* item_idxs, int32 slot_idx, + Value** d_banks, int32 bank_num, + int32 slot_num, int32 bank_size, + int32 total_num) { + auto item_idx = blockIdx.x; + if (item_idx < total_num) { + auto item_pos = item_idxs[item_idx]; + auto bank_idx = item_pos / bank_size; + auto offset_in_bank = item_pos % bank_size; + auto slot_offset = bank_idx * slot_num + slot_idx; + if (key[item_idx] != empty_key_sentinel) { + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + val[item_idx * dim + id] = + 
d_banks[slot_offset][offset_in_bank * dim + id]; + } + } + } +} + +template +struct KvEmbGetSnapshot { + void operator()(Key* key, Value* val, Key empty_key_sentinel, int64 dim, + int32* item_idxs, int32 num_items, int32 slot_idx, + Value** d_banks, int32 bank_num, int32 slot_num, + int32 bank_size, cudaStream_t stream) { + auto const block_size = 256; + auto const grid_size = num_items; + if (grid_size == 0) return; + TF_CHECK_OK(GpuLaunchKernel( + kv_emb_get_snapshot_kernel, grid_size, block_size, 0, + stream, key, val, empty_key_sentinel, dim, item_idxs, slot_idx, d_banks, + bank_num, slot_num, bank_size, num_items)); + } +}; + +} // namespace functor + +#define REGISTER_ALL_TYPE(type) \ + template struct functor::KvInitStaticMap; \ + template struct functor::KvInitStaticMap; \ + template struct functor::KvLookupInsertKey; \ + template struct functor::KvLookupInsertKey; \ + template struct functor::KvLookupCreateEmb; \ + template struct functor::KvLookupCreateEmb; \ + template struct functor::KvKeyGetSnapshot; \ + template struct functor::KvKeyGetSnapshot; \ + template struct functor::KvEmbGetSnapshot; \ + template struct functor::KvEmbGetSnapshot; \ + template struct functor::KvUpdateEmb; \ + template struct functor::KvUpdateEmb; +TF_CALL_REAL_NUMBER_TYPES(REGISTER_ALL_TYPE) + +#define REGISTER_LOOKUP_KERNEL_ALL(hash_table, type) \ + template struct functor::KvLookupKey, int32, type>; \ + template struct functor::KvLookupKey, int64, type>; +#define REGISTER_INFERENCE_LOOKUP_KERNEL(type) \ + REGISTER_LOOKUP_KERNEL_ALL(GPUHashTable, type) +#define REGISTER_TRAINING_LOOKUP_KERNEL(type) \ + REGISTER_LOOKUP_KERNEL_ALL(GPUStaticHashTable, type) + +TF_CALL_REAL_NUMBER_TYPES(REGISTER_INFERENCE_LOOKUP_KERNEL) +TF_CALL_REAL_NUMBER_TYPES(REGISTER_TRAINING_LOOKUP_KERNEL) + +#undef REGISTER_INFERENCE_LOOKUP_KERNEL +#undef REGISTER_TRAINING_LOOKUP_KERNEL +#undef REGISTER_LOOKUP_KERNEL_ALL_TYPE +#undef REGISTER_ALL_TYPE + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/gpu_hash_table.h b/deepray/custom_ops/embedding_variable/cc/embedding/gpu_hash_table.h new file mode 100644 index 00000000..497b8017 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/gpu_hash_table.h @@ -0,0 +1,136 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_GPU_HASH_TABLE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_GPU_HASH_TABLE_H_ + +#if GOOGLE_CUDA +#include + +#include "tensorflow/core/framework/typed_allocator.h" +#include "tensorflow/core/lib/core/status.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +template +class gpu_hash_map_tf_allocator; + +template +class DynamicHashTable; + +template +class StaticHashTable; + +template +class GPUStaticHashTable { + public: + GPUStaticHashTable(size_t capacity, int dimension, K empty_key_sentinel, + int32 empty_value_sentinel, Allocator* alloc, + cudaStream_t stream); + + ~GPUStaticHashTable(); + + std::size_t Size(); + + StaticHashTable>* hash_table; + V* values_d{nullptr}; + int dimension_; + V* default_values{nullptr}; + int capacity_; +}; + +template +class GPUHashTable { + public: + GPUHashTable(K empty_key_sentinel, Allocator* alloc, + size_t initial_capacity = 50000); + + ~GPUHashTable(); + + int32 Size(); + + DynamicHashTable>* hash_table; + + const int32 initial_bank_size; + cuda::atomic* start_idx; + int32 mem_bank_num = 0; + std::vector bank_ptrs; + V** d_bank_ptrs = nullptr; + std::vector existence_flag_ptrs; + bool** d_existence_flag_ptrs = nullptr; +}; + +namespace functor { + +template +struct KvLookupKey { + void operator()(const Key* key_first, V* value_first, int32 num_items, + int32 dimension, int32 slot_idx, int32 slot_num, + HashTable* hash_table, const V* default_v, + int32 default_v_num, cudaStream_t stream); +}; + +template +struct KvInitStaticMap { + void operator()(const Key* key_first, GPUStaticHashTable* hash_table, + int32 num_items, int32 dimension, cudaStream_t stream); +}; + +template +struct KvLookupInsertKey { + void operator()( + const Key* key_first, int32* value_first, int32 num_items, + GPUHashTable* hash_table, + cuda::atomic* start_idx, + cudaStream_t stream); +}; + +template +struct KvLookupCreateEmb { + void operator()(const Key* key_first, Value* val, Value* default_v, int64 dim, + int32* item_idxs, int32 num_items, int32 slot_idx, + int32 default_v_num, Value** d_banks, bool** d_flags, + int32 slot_num, int32 bank_size, cudaStream_t stream); +}; + +template +struct KvUpdateEmb { + void operator()(const Key* key_first, Value* default_v, int64 dim, + int32* item_idxs, int32 num_items, int32 slot_idx, + int32 default_v_num, Value** d_banks, bool** d_flags, + int32 slot_num, int32 bank_size, cudaStream_t stream); +}; + +template +struct KvKeyGetSnapshot { + void operator()(Key* key_first, int32* value_first, int32 slot_idx, + int32 primary_slot_idx, bool** d_flags, int32 bank_num, + int32 slot_num, int32 bank_size, + GPUHashTable* hash_table, int32 ev_size, + cudaStream_t stream); +}; + +template +struct KvEmbGetSnapshot { + void operator()(Key* key, Value* val, Key empty_key_sentinel, int64 dim, + int32* item_idxs, int32 num_items, int32 slot_idx, + Value** d_banks, int32 bank_num, int32 slot_num, + int32 bank_size, cudaStream_t stream); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_GPU_HASH_TABLE_H_ \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/hbm_dram_ssd_storage.h b/deepray/custom_ops/embedding_variable/cc/embedding/hbm_dram_ssd_storage.h new file mode 100644 index 00000000..430acb5a --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/hbm_dram_ssd_storage.h @@ 
-0,0 +1,601 @@ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_DRAM_SSD_STORAGE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_DRAM_SSD_STORAGE_H_ + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "hbm_storage_iterator.h" +#include "multi_tier_storage.h" +#include "single_tier_storage.h" +#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" +#include "tensorflow/core/platform/stream_executor.h" + +namespace tensorflow { +using se::DeviceMemoryBase; +using se::Stream; + +template +class CheckpointLoader; + +void SyncWithEventMgr(se::Stream* stream, EventMgr* event_mgr); + +namespace embedding { +template +class HbmDramSsdStorage : public MultiTierStorage { + public: + HbmDramSsdStorage(const StorageConfig& sc, Allocator* gpu_alloc, + FeatureDescriptor* feat_desc, const std::string& name) + : gpu_alloc_(gpu_alloc), + MultiTierStorage(sc, name), + dram_capacity_(-1) { + hbm_ = new HbmStorageWithCpuKv(sc, feat_desc); + hbm_feat_desc_ = feat_desc; + dram_feat_desc_ = new FeatureDescriptor(feat_desc); + dram_ = new DramStorage(sc, dram_feat_desc_); + ssd_ = new SsdHashStorage(sc, dram_feat_desc_); + } + + ~HbmDramSsdStorage() override { + MultiTierStorage::DeleteFromEvictionManager(); + delete hbm_; + delete dram_; + delete ssd_; + } + + TF_DISALLOW_COPY_AND_ASSIGN(HbmDramSsdStorage); + + void Init() override { + dram_feat_desc_->InitSlotInfo(hbm_feat_desc_); + ssd_->Init(); + + MultiTierStorage::cache_capacity_ = + Storage::storage_config_.size[0] / (total_dim() * sizeof(V)); + + dram_capacity_ = + Storage::storage_config_.size[1] / (total_dim() * sizeof(V)); + MultiTierStorage::ready_eviction_ = true; + } + + Status Get(K key, void** value_ptr) override { + Status s = hbm_->Get(key, value_ptr); + if (s.ok()) { + return s; + } + s = dram_->Get(key, value_ptr); + if (s.ok()) { + AddCopyBackFlagToValuePtr(value_ptr, COPYBACK); + return s; + } + s = ssd_->Get(key, value_ptr); + if (s.ok()) { + AddCopyBackFlagToValuePtr(value_ptr, COPYBACK_AND_DESTROY); + return s; + } + return s; + } + + void BatchGet(const EmbeddingVarContext& ctx, const K* keys, + void** value_ptr_list, int64 num_of_keys) override { + int num_worker_threads = ctx.worker_threads->num_threads; + std::vector> copyback_cursor_list(num_worker_threads + 1); + std::vector> ssd_value_ptr_list(num_worker_threads + 1); + + BatchGetValuePtrs(ctx, keys, value_ptr_list, num_of_keys, + copyback_cursor_list, ssd_value_ptr_list); + + CopyEmbeddingsFromDramToHbm(ctx, keys, value_ptr_list, + copyback_cursor_list[0], ssd_value_ptr_list[0]); + } + + void BatchGetOrCreate( + const EmbeddingVarContext& ctx, const K* keys, + void** value_ptr_list, int64 num_of_keys, int64 value_len, + std::vector>& not_fountd_cursor_list) override { + int num_worker_threads = ctx.worker_threads->num_threads; + std::vector> copyback_cursor_list(num_worker_threads + 1); + std::vector> ssd_value_ptr_list(num_worker_threads + 1); + + BatchGetValuePtrs(ctx, keys, value_ptr_list, num_of_keys, + copyback_cursor_list, ssd_value_ptr_list, + ¬_fountd_cursor_list); + + CopyEmbeddingsFromDramToHbm(ctx, keys, value_ptr_list, + copyback_cursor_list[0], ssd_value_ptr_list[0]); + + CreateValuePtrs(ctx, keys, value_ptr_list, not_fountd_cursor_list[0], + value_len); + } + + void Insert(K key, void** value_ptr) override { + hbm_->Insert(key, value_ptr); + } + + void CreateAndInsert(K key, void** value_ptr, bool to_dram = false) override { + if (to_dram) { + dram_->Insert(key, value_ptr); + } else { + hbm_->Insert(key, value_ptr); + } + } + + Status GetOrCreate(K key, 
void** value_ptr) override { + LOG(FATAL) << "Stroage with HBM only suppotrs batch APIs."; + } + + void InitCache(embedding::CacheStrategy cache_strategy) override { + MultiTierStorage::InitCache(cache_strategy); + dram_cache_ = new LRUCache(); + } + + Status Remove(K key) override { + hbm_->Remove(key); + dram_->Remove(key); + ssd_->Remove(key); + return OkStatus(); + } + + int64 Size() const override { + int64 total_size = hbm_->Size(); + total_size += dram_->Size(); + total_size += ssd_->Size(); + return total_size; + } + + int64 Size(int level) const override { + if (level == 0) { + return hbm_->Size(); + } else if (level == 1) { + return dram_->Size(); + } else if (level == 2) { + return ssd_->Size(); + } else { + return -1; + } + } + + int LookupTier(K key) const override { + Status s = hbm_->Contains(key); + if (s.ok()) return 0; + s = dram_->Contains(key); + if (s.ok()) return 1; + s = ssd_->Contains(key); + if (s.ok()) return 2; + return -1; + } + + bool IsUseHbm() override { return true; } + + bool IsSingleHbm() override { return false; } + + Status Save(const string& tensor_name, const string& prefix, + BundleWriter* writer, const EmbeddingConfig& emb_config, + ShrinkArgs& shrink_args, int64 value_len, + V* default_value) override { + std::vector key_list, tmp_dram_key_list; + std::vector value_ptr_list, tmp_dram_value_list; + TF_CHECK_OK(hbm_->GetSnapshot(&key_list, &value_ptr_list)); + hbm_->Shrink(key_list, value_ptr_list, shrink_args, value_len); + + HbmValueIterator hbm_value_iter(key_list, value_ptr_list, + emb_config.emb_index, value_len, + gpu_alloc_, hbm_feat_desc_); + + for (int64 i = 0; i < value_ptr_list.size(); i++) { + void* value_ptr = cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, hbm_feat_desc_->data_bytes()); + hbm_feat_desc_->SetFreq(value_ptr, + hbm_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion( + value_ptr, hbm_feat_desc_->GetVersion(value_ptr_list[i])); + value_ptr_list[i] = (void*)((int64)value_ptr | (1L << kDramFlagOffset)); + } + + TF_CHECK_OK(dram_->GetSnapshot(&tmp_dram_key_list, &tmp_dram_value_list)); + dram_->Shrink(tmp_dram_key_list, tmp_dram_value_list, shrink_args, + value_len); + + for (int64 i = 0; i < tmp_dram_key_list.size(); i++) { + Status s = hbm_->Contains(tmp_dram_key_list[i]); + if (!s.ok()) { + key_list.emplace_back(tmp_dram_key_list[i]); + value_ptr_list.emplace_back(tmp_dram_value_list[i]); + } + } + + { + mutex_lock l(*(hbm_->get_mutex())); + std::vector*> feat_desc_list(2); + feat_desc_list[0] = dram_feat_desc_; + feat_desc_list[1] = hbm_feat_desc_; + TF_CHECK_OK((Storage::SaveToCheckpoint( + tensor_name, writer, emb_config, value_len, default_value, key_list, + value_ptr_list, feat_desc_list, &hbm_value_iter))); + } + + for (auto value_ptr : value_ptr_list) { + if ((int64)value_ptr >> kDramFlagOffset == 1) { + value_ptr = (void*)((int64)value_ptr & ((1L << kDramFlagOffset) - 1)); + cpu_allocator()->DeallocateRaw(value_ptr); + } + } + + ssd_->Save(tensor_name, prefix, writer, emb_config, shrink_args, value_len, + default_value); + + return OkStatus(); + } + + Status DramToSsdBatchCommit(std::shared_ptr> keys) { + MultiTierStorage::ReleaseValuePtrs(dram_value_ptr_out_of_date_, + dram_feat_desc_); + mutex_lock l(*(ssd_->get_mutex())); + mutex_lock l1(*(dram_->get_mutex())); + + dram_cache_->update(keys->data(), keys->size()); + int64 dram_count = dram_cache_->size(); + if (dram_count > dram_capacity_) { + int k_size = dram_count - dram_capacity_; + constexpr int DramEvictionSize = 10000; + 
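+      // Cap each commit at DramEvictionSize ids; keys beyond the cap stay in
+      // DRAM and are handled by a later commit.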
k_size = std::min(k_size, DramEvictionSize); + K dram_evic_ids[DramEvictionSize]; + size_t true_size = dram_cache_->get_evic_ids(dram_evic_ids, k_size); + void* value_ptr; + for (int64 i = 0; i < true_size; ++i) { + if (dram_->Get(dram_evic_ids[i], &value_ptr).ok()) { + TF_CHECK_OK(ssd_->Commit(dram_evic_ids[i], value_ptr)); + TF_CHECK_OK(dram_->Remove(dram_evic_ids[i])); + dram_value_ptr_out_of_date_.emplace_back(value_ptr); + } + } + } + return OkStatus(); + } + + void BatchEviction() override { + constexpr int EvictionSize = 10000; + K evic_ids[EvictionSize]; + if (!MultiTierStorage::ready_eviction_) { + return; + } + mutex_lock l(*(hbm_->get_mutex())); + mutex_lock l1(*(dram_->get_mutex())); + + int64 cache_count = MultiTierStorage::cache_->size(); + if (cache_count > MultiTierStorage::cache_capacity_) { + // eviction + int k_size = cache_count - MultiTierStorage::cache_capacity_; + k_size = std::min(k_size, EvictionSize); + size_t true_size = + MultiTierStorage::cache_->get_evic_ids(evic_ids, k_size); + void* value_ptr; + std::shared_ptr> keys(new std::vector()); + std::vector hbm_value_ptrs; + std::vector dram_value_ptrs; + + for (int64 i = 0; i < true_size; ++i) { + if (hbm_->Get(evic_ids[i], &value_ptr).ok()) { + keys->emplace_back(evic_ids[i]); + hbm_value_ptrs.emplace_back(value_ptr); + void* dram_value_ptr = dram_->CreateValuePtr(); + dram_feat_desc_->SetFreq(dram_value_ptr, + hbm_feat_desc_->GetFreq(value_ptr)); + dram_feat_desc_->UpdateVersion(dram_value_ptr, + hbm_feat_desc_->GetVersion(value_ptr)); + dram_value_ptrs.emplace_back(dram_value_ptr); + } + } + + CopyEmbeddingFromHbmToDram(hbm_value_ptrs, dram_value_ptrs, gpu_alloc_, + hbm_feat_desc_, dram_feat_desc_); + + dram_->BatchCommit(*keys, dram_value_ptrs); + hbm_feat_desc_->Deallocate(hbm_value_ptrs); + for (auto it : *keys) { + TF_CHECK_OK(hbm_->Remove(it)); + } + MultiTierStorage::eviction_manager_->Schedule( + [this, keys]() { DramToSsdBatchCommit(keys); }); + } + } + + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + hbm_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + + protected: + int total_dim() override { return hbm_feat_desc_->total_dim(); } + + void Restore(const std::string& name_string, + const std::string& file_name_string, int64 partition_id, + int64 partition_num, int64 value_len, bool is_incr, + bool reset_version, const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, BundleReader* reader, + EmbeddingVar* ev, + FilterPolicy>* filter) override { + CheckpointLoader restorer(reinterpret_cast*>(this), ev, + filter, name_string, file_name_string, + partition_id, partition_num, is_incr, + reset_version, reader); + restorer.RestoreCkpt(emb_config, device); + + int64 num_of_hbm_ids = + std::min(MultiTierStorage::cache_capacity_, + (int64)MultiTierStorage::cache_->size()); + if (num_of_hbm_ids > 0) { + K* hbm_ids = new K[num_of_hbm_ids]; + int64* hbm_freqs = new int64[num_of_hbm_ids]; + int64* hbm_versions = nullptr; + MultiTierStorage::cache_->get_cached_ids(hbm_ids, num_of_hbm_ids, + hbm_versions, hbm_freqs); + ImportToHbm(hbm_ids, num_of_hbm_ids, value_len, emb_config.emb_index); + MultiTierStorage::cache_thread_pool_->Schedule( + [this, hbm_ids, num_of_hbm_ids, hbm_versions, hbm_freqs]() { + MultiTierStorage::cache_->update(hbm_ids, num_of_hbm_ids, + hbm_versions, hbm_freqs); + delete[] hbm_ids; + delete[] hbm_freqs; + }); + } + } + + Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, + int64 partition_num, int64 value_len, 
bool is_filter, + bool is_incr, const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, + FilterPolicy>* filter, + RestoreBuffer& restore_buff) override { + Status s = filter->Restore(key_num, bucket_num, partition_id, partition_num, + value_len, is_filter, true /*to_dram*/, is_incr, + restore_buff); + + MultiTierStorage::cache_->update((K*)restore_buff.key_buffer, key_num, + (int64*)restore_buff.version_buffer, + (int64*)restore_buff.freq_buffer); + return s; + } + + void Import(K key, V* value, int64 freq, int64 version, + int emb_index) override {} + + private: + void ImportToHbm(K* ids, int64 size, int64 value_len, int64 emb_index) { + V* memcpy_buffer_cpu = new V[size * value_len]; + V** value_address = new V*[size]; + V* memcpy_buffer_gpu = (V*)gpu_alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, size * value_len * sizeof(V)); + V* dev_value_address = (V*)gpu_alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, size * sizeof(V*)); + void** gpu_value_ptrs = new void*[size]; + void** cpu_value_ptrs = new void*[size]; + for (int64 i = 0; i < size; i++) { + dram_->Get(ids[i], &cpu_value_ptrs[i]); + gpu_value_ptrs[i] = hbm_->CreateValuePtr(); + Status s = hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); + if (!s.ok()) { + hbm_feat_desc_->Deallocate(gpu_value_ptrs[i]); + hbm_->Get(ids[i], &gpu_value_ptrs[i]); + } + } + // Split from above for loop for minize the cost of mutex lock + // TODO: Speed up with intra parallelism + + for (int64 i = 0; i < size; i++) { + memcpy(memcpy_buffer_cpu + i * value_len, + dram_feat_desc_->GetEmbedding(cpu_value_ptrs[i], emb_index), + value_len * sizeof(V)); + value_address[i] = + hbm_feat_desc_->GetEmbedding(gpu_value_ptrs[i], emb_index); + } + cudaMemcpy(memcpy_buffer_gpu, memcpy_buffer_cpu, + size * value_len * sizeof(V), cudaMemcpyHostToDevice); + cudaMemcpy(dev_value_address, value_address, size * sizeof(V*), + cudaMemcpyHostToDevice); + int block_dim = 128; + void* args[] = {(void*)&dev_value_address, (void*)&memcpy_buffer_gpu, + (void*)&value_len, (void*)&size}; + + cudaLaunchKernel((void*)BatchUnpack, + (size + block_dim - 1) / block_dim * value_len, block_dim, + args, 0, NULL); + cudaDeviceSynchronize(); + + delete[] memcpy_buffer_cpu; + delete[] cpu_value_ptrs; + delete[] gpu_value_ptrs; + delete[] value_address; + gpu_alloc_->DeallocateRaw(dev_value_address); + gpu_alloc_->DeallocateRaw(memcpy_buffer_gpu); + } + + void BatchGetValuePtrs( + const EmbeddingVarContext& ctx, const K* keys, + void** value_ptr_list, int64 num_of_keys, + std::vector>& copyback_cursor_list, + std::vector>& ssd_value_ptr_list, + std::vector>* not_found_cursor_list = nullptr) { + int num_worker_threads = ctx.worker_threads->num_threads; + IntraThreadCopyIdAllocator thread_copy_id_alloc(num_worker_threads); + uint64 main_thread_id = Env::Default()->GetCurrentThreadId(); + + std::function>*, int64, int)> + set_not_found_list = 0; + if (not_found_cursor_list != nullptr) { + set_not_found_list = + [](std::vector>* not_found_cursor_list, int64 i, + int copy_id) { + (*not_found_cursor_list)[copy_id].emplace_back(i); + }; + } else { + set_not_found_list = + [](std::vector>* not_found_cursor_list, int64 i, + int copy_id) {}; + } + + auto do_work = [this, keys, value_ptr_list, &thread_copy_id_alloc, + main_thread_id, ©back_cursor_list, &ssd_value_ptr_list, + set_not_found_list, + ¬_found_cursor_list](int64 start, int64 limit) { + int copy_id = thread_copy_id_alloc.GetCopyIdOfThread(main_thread_id); + for (int64 i = start; i < limit; i++) { + Status s = Get(keys[i], 
&value_ptr_list[i]); + if (s.ok()) { + int64 copyback_flag = + (int64)value_ptr_list[i] >> copyback_flag_offset_bits_; + RemoveCopyBackFlagInValuePtr(&value_ptr_list[i]); + if (copyback_flag == COPYBACK) { + copyback_cursor_list[copy_id].emplace_back(i); + } else if (copyback_flag == COPYBACK_AND_DESTROY) { + copyback_cursor_list[copy_id].emplace_back(i); + ssd_value_ptr_list[copy_id].emplace_back(value_ptr_list[i]); + } + } else { + value_ptr_list[i] = nullptr; + set_not_found_list(not_found_cursor_list, i, copy_id); + } + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + 1000, do_work); + + for (int i = 1; i < worker_threads->num_threads + 1; i++) { + if (copyback_cursor_list[i].size() > 0) { + copyback_cursor_list[0].splice(copyback_cursor_list[0].end(), + copyback_cursor_list[i]); + } + if (ssd_value_ptr_list[i].size() > 0) { + ssd_value_ptr_list[0].splice(ssd_value_ptr_list[0].end(), + ssd_value_ptr_list[i]); + } + } + + if (not_found_cursor_list != nullptr) { + for (int i = 1; i < worker_threads->num_threads + 1; i++) { + if ((*not_found_cursor_list)[i].size() > 0) { + (*not_found_cursor_list)[0].splice((*not_found_cursor_list)[0].end(), + (*not_found_cursor_list)[i]); + } + } + } + } + + void CopyEmbeddingsFromDramToHbm(const EmbeddingVarContext& ctx, + const K* keys, void** value_ptr_list, + std::list& copyback_cursors, + std::list& ssd_value_ptrs) { + int64 total = copyback_cursors.size(); + std::vector gpu_value_ptrs(total); + std::vector copyback_keys(total); + std::vector memory_index(total); + // Create Hbm ValuePtrs. + int64 i = 0; + auto it = copyback_cursors.cbegin(); + // Mutex with eviction thread + for (; it != copyback_cursors.cend(); ++it, ++i) { + int64 j = *it; + memory_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + hbm_feat_desc_->SetFreq(gpu_value_ptr, + dram_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion( + gpu_value_ptr, dram_feat_desc_->GetVersion(value_ptr_list[i])); + gpu_value_ptrs[i] = gpu_value_ptr; + copyback_keys[i] = keys[*it]; + } + MultiTierStorage::CopyEmbeddingsFromDramToHbm( + ctx, keys, value_ptr_list, copyback_cursors, memory_index, + gpu_value_ptrs, hbm_feat_desc_->total_dim(), hbm_feat_desc_, + dram_feat_desc_); + + // Insert copyback ids to hbm hash table. + auto do_insert = [this, copyback_keys, gpu_value_ptrs, memory_index, + value_ptr_list](int64 start, int64 limit) { + for (int64 i = start; i < limit; i++) { + Status s = hbm_->TryInsert(copyback_keys[i], gpu_value_ptrs[i]); + if (!s.ok()) { + hbm_->DestroyValuePtr(gpu_value_ptrs[i]); + hbm_->Get(copyback_keys[i], &value_ptr_list[memory_index[i]]); + } + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, total, 100000, + do_insert); + + for (auto it = ssd_value_ptrs.cbegin(); it != ssd_value_ptrs.cend(); ++it) { + ssd_->DestroyValuePtr(*it); + } + } + + void CreateValuePtrs(const EmbeddingVarContext& ctx, const K* keys, + void** value_ptr_list, + std::list& not_found_cursors, int64 value_len) { + int64 total = not_found_cursors.size(); + if (total > 0) { + std::vector> insert_pairs(total); + std::vector cursor_index(total); + // Create Hbm ValuePtrs. 
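+      // One HBM value pointer is created per missing key first; the default
+      // values for the whole batch are then written in a single
+      // SetDefaultValues call on the compute stream.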
+ + int64 i = 0; + auto it = not_found_cursors.cbegin(); + // Mutex with eviction thread + for (; it != not_found_cursors.cend(); ++it, ++i) { + int64 j = *it; + cursor_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + value_ptr_list[j] = gpu_value_ptr; + insert_pairs[i].first = keys[j]; + insert_pairs[i].second = value_ptr_list[j]; + } + + hbm_feat_desc_->SetDefaultValues(keys, not_found_cursors, value_ptr_list, + ctx.compute_stream, ctx.event_mgr, + ctx.gpu_device); + + // Insert copyback ids to hbm hash table. + auto do_insert = [this, insert_pairs, value_ptr_list, cursor_index]( + int64 start, int64 limit) { + for (int64 i = start; i < limit; i++) { + Status s = + hbm_->TryInsert(insert_pairs[i].first, insert_pairs[i].second); + if (!s.ok()) { + hbm_->DestroyValuePtr(insert_pairs[i].second); + hbm_->Get(insert_pairs[i].first, &value_ptr_list[cursor_index[i]]); + } + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, total, 100000, + do_insert); + } + } + + void AddCopyBackFlagToValuePtr(void** value_ptr, CopyBackFlag copyback_flag) { + int64 tmp = ((int64)copyback_flag) << copyback_flag_offset_bits_; + tmp = ((int64)*value_ptr) | tmp; + *value_ptr = reinterpret_cast(tmp); + } + + void RemoveCopyBackFlagInValuePtr(void** value_ptr) { + int64 tmp = (1L << (copyback_flag_offset_bits_)) - 1; + tmp = ((int64)*value_ptr) & tmp; + *value_ptr = reinterpret_cast(tmp); + } + + private: + HbmStorageWithCpuKv* hbm_ = nullptr; + DramStorage* dram_ = nullptr; + SsdHashStorage* ssd_ = nullptr; + Allocator* gpu_alloc_; + BatchCache* dram_cache_; + int64 dram_capacity_; + std::deque dram_value_ptr_out_of_date_; + FeatureDescriptor* hbm_feat_desc_ = nullptr; + FeatureDescriptor* dram_feat_desc_ = nullptr; + const int copyback_flag_offset_bits_ = 60; +}; +} // namespace embedding +} // namespace tensorflow + +#endif // GOOGLE_CUDA +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_DRAM_SSD_STORAGE_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/hbm_dram_storage.h b/deepray/custom_ops/embedding_variable/cc/embedding/hbm_dram_storage.h new file mode 100644 index 00000000..5b9531c9 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/hbm_dram_storage.h @@ -0,0 +1,536 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_DRAM_STORAGE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_DRAM_STORAGE_H_ + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "hbm_storage_iterator.h" +#include "intra_thread_copy_id_allocator.h" +#include "multi_tier_storage.h" +#include "single_tier_storage.h" +#include "tensorflow/core/platform/stream_executor.h" + +namespace tensorflow { +using se::DeviceMemoryBase; +using se::Stream; + +template +class CheckpointLoader; + +void SyncWithEventMgr(se::Stream* stream, EventMgr* event_mgr); + +namespace embedding { +template +class HbmDramStorage : public MultiTierStorage { + public: + HbmDramStorage(const StorageConfig& sc, Allocator* gpu_alloc, + FeatureDescriptor* feat_desc, const std::string& name) + : gpu_alloc_(gpu_alloc), MultiTierStorage(sc, name) { + hbm_ = new HbmStorageWithCpuKv(sc, feat_desc); + hbm_feat_desc_ = feat_desc; + dram_feat_desc_ = new FeatureDescriptor(feat_desc); + dram_ = new DramStorage(sc, dram_feat_desc_); + } + + ~HbmDramStorage() override { + MultiTierStorage::DeleteFromEvictionManager(); + delete hbm_; + delete dram_; + delete dram_feat_desc_; + } + + TF_DISALLOW_COPY_AND_ASSIGN(HbmDramStorage); + + Status Get(K key, void** value_ptr) override { + Status s = hbm_->Get(key, value_ptr); + if (s.ok()) { + return s; + } + s = dram_->Get(key, value_ptr); + if (s.ok()) { + AddCopyBackFlagToValuePtr(value_ptr, COPYBACK); + return s; + } + return s; + } + + void BatchGet(const EmbeddingVarContext& ctx, const K* keys, + void** value_ptr_list, int64 num_of_keys) override { + int num_worker_threads = ctx.worker_threads->num_threads; + std::vector> copyback_cursor_list(num_worker_threads + 1); + + BatchGetValuePtrs(ctx, keys, value_ptr_list, num_of_keys, + copyback_cursor_list); + + CopyEmbeddingsFromDramToHbm(ctx, keys, value_ptr_list, + copyback_cursor_list[0]); + } + + void Insert(K key, void** value_ptr) override { + hbm_->Insert(key, value_ptr); + } + + void BatchGetOrCreate( + const EmbeddingVarContext& ctx, const K* keys, + void** value_ptr_list, int64 num_of_keys, int64 value_len, + std::vector>& not_fountd_cursor_list) override { + int num_worker_threads = ctx.worker_threads->num_threads; + std::vector> copyback_cursor_list(num_worker_threads + 1); + + BatchGetValuePtrs(ctx, keys, value_ptr_list, num_of_keys, + copyback_cursor_list, ¬_fountd_cursor_list); + + CopyEmbeddingsFromDramToHbm(ctx, keys, value_ptr_list, + copyback_cursor_list[0]); + CreateValuePtrs(ctx, keys, value_ptr_list, not_fountd_cursor_list[0], + value_len); + } + + void CreateAndInsert(K key, void** value_ptr, bool to_dram = false) override { + if (to_dram) { + dram_->CreateAndInsert(key, value_ptr); + } else { + hbm_->CreateAndInsert(key, value_ptr); + } + } + + Status GetOrCreate(K key, void** value_ptr) override { + LOG(FATAL) << "Stroage with HBM only suppotrs batch APIs."; + } + + Status Remove(K key) override { + hbm_->Remove(key); + dram_->Remove(key); + return OkStatus(); + } + + int64 Size() const override { + int64 total_size = hbm_->Size(); + total_size += dram_->Size(); + return total_size; + } + + int64 Size(int level) const override { + if (level == 0) { + return hbm_->Size(); + } else if (level == 1) { + return dram_->Size(); + } else { + return -1; + } + } + + int LookupTier(K key) const override { + Status s = hbm_->Contains(key); + if (s.ok()) return 0; + s = dram_->Contains(key); + if (s.ok()) return 1; + return -1; + } + + bool IsUseHbm() 
override { return true; } + + bool IsSingleHbm() override { return false; } + + Status Save(const string& tensor_name, const string& prefix, + BundleWriter* writer, const EmbeddingConfig& emb_config, + ShrinkArgs& shrink_args, int64 value_len, + V* default_value) override { + std::vector key_list, tmp_dram_key_list; + std::vector value_ptr_list, tmp_dram_value_list; + TF_CHECK_OK(hbm_->GetSnapshot(&key_list, &value_ptr_list)); + hbm_->Shrink(key_list, value_ptr_list, shrink_args, value_len); + + HbmValueIterator hbm_value_iter(key_list, value_ptr_list, + emb_config.emb_index, value_len, + gpu_alloc_, hbm_feat_desc_); + + for (int64 i = 0; i < value_ptr_list.size(); i++) { + void* value_ptr = cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, hbm_feat_desc_->data_bytes()); + hbm_feat_desc_->SetFreq(value_ptr, + hbm_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion( + value_ptr, hbm_feat_desc_->GetVersion(value_ptr_list[i])); + value_ptr_list[i] = (void*)((int64)value_ptr | (1L << kDramFlagOffset)); + } + + TF_CHECK_OK(dram_->GetSnapshot(&tmp_dram_key_list, &tmp_dram_value_list)); + dram_->Shrink(tmp_dram_key_list, tmp_dram_value_list, shrink_args, + value_len); + + for (int64 i = 0; i < tmp_dram_key_list.size(); i++) { + Status s = hbm_->Contains(tmp_dram_key_list[i]); + if (!s.ok()) { + key_list.emplace_back(tmp_dram_key_list[i]); + value_ptr_list.emplace_back(tmp_dram_value_list[i]); + } + } + + { + mutex_lock l(*(hbm_->get_mutex())); + std::vector*> feat_desc_list(2); + feat_desc_list[0] = dram_feat_desc_; + feat_desc_list[1] = hbm_feat_desc_; + TF_CHECK_OK((Storage::SaveToCheckpoint( + tensor_name, writer, emb_config, value_len, default_value, key_list, + value_ptr_list, feat_desc_list, &hbm_value_iter))); + } + + for (auto value_ptr : value_ptr_list) { + if ((int64)value_ptr >> kDramFlagOffset == 1) { + value_ptr = (void*)((int64)value_ptr & ((1L << kDramFlagOffset) - 1)); + cpu_allocator()->DeallocateRaw(value_ptr); + } + } + return OkStatus(); + } + + void BatchEviction() override { + constexpr int EvictionSize = 10000; + K evic_ids[EvictionSize]; + if (!MultiTierStorage::ready_eviction_) { + return; + } + mutex_lock l(*(hbm_->get_mutex())); + mutex_lock l1(*(dram_->get_mutex())); + + int64 cache_count = MultiTierStorage::cache_->size(); + if (cache_count > MultiTierStorage::cache_capacity_) { + // eviction + int k_size = cache_count - MultiTierStorage::cache_capacity_; + k_size = std::min(k_size, EvictionSize); + size_t true_size = + MultiTierStorage::cache_->get_evic_ids(evic_ids, k_size); + void* value_ptr; + std::vector keys; + std::vector hbm_value_ptrs; + std::vector dram_value_ptrs; + + for (int64 i = 0; i < true_size; ++i) { + if (hbm_->Get(evic_ids[i], &value_ptr).ok()) { + keys.emplace_back(evic_ids[i]); + hbm_value_ptrs.emplace_back(value_ptr); + void* dram_value_ptr = dram_->CreateValuePtr(); + dram_feat_desc_->SetFreq(dram_value_ptr, + hbm_feat_desc_->GetFreq(value_ptr)); + dram_feat_desc_->UpdateVersion(dram_value_ptr, + hbm_feat_desc_->GetVersion(value_ptr)); + dram_value_ptrs.emplace_back(dram_value_ptr); + } + } + + CopyEmbeddingFromHbmToDram(hbm_value_ptrs, dram_value_ptrs, gpu_alloc_, + hbm_feat_desc_, dram_feat_desc_); + + dram_->BatchCommit(keys, dram_value_ptrs); + hbm_feat_desc_->Deallocate(hbm_value_ptrs); + for (auto it : keys) { + TF_CHECK_OK(hbm_->Remove(it)); + } + } + } + + void Restore(const std::string& name_string, + const std::string& file_name_string, int64 partition_id, + int64 partition_num, int64 value_len, bool 
is_incr, + bool reset_version, const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, BundleReader* reader, + EmbeddingVar* ev, + FilterPolicy>* filter) override { + CheckpointLoader restorer(reinterpret_cast*>(this), ev, + filter, name_string, file_name_string, + partition_id, partition_num, is_incr, + reset_version, reader); + + restorer.RestoreCkpt(emb_config, device); + + int64 num_of_hbm_ids = + std::min(MultiTierStorage::cache_capacity_, + (int64)MultiTierStorage::cache_->size()); + if (num_of_hbm_ids > 0) { + K* hbm_ids = new K[num_of_hbm_ids]; + int64* hbm_freqs = new int64[num_of_hbm_ids]; + int64* hbm_versions = nullptr; + MultiTierStorage::cache_->get_cached_ids(hbm_ids, num_of_hbm_ids, + hbm_versions, hbm_freqs); + ImportToHbm(hbm_ids, num_of_hbm_ids, value_len, emb_config.emb_index); + MultiTierStorage::cache_thread_pool_->Schedule( + [this, hbm_ids, num_of_hbm_ids, hbm_versions, hbm_freqs]() { + MultiTierStorage::cache_->update(hbm_ids, num_of_hbm_ids, + hbm_versions, hbm_freqs); + delete[] hbm_ids; + delete[] hbm_freqs; + }); + } + } + + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + hbm_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + + void Init() override { + dram_feat_desc_->InitSlotInfo(hbm_feat_desc_); + MultiTierStorage::Init(); + } + + protected: + Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, + int64 partition_num, int64 value_len, bool is_filter, + bool is_incr, const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, + FilterPolicy>* filter, + RestoreBuffer& restore_buff) override { + Status s = filter->Restore(key_num, bucket_num, partition_id, partition_num, + value_len, is_filter, true /*to_dram*/, is_incr, + restore_buff); + + MultiTierStorage::cache_->update((K*)restore_buff.key_buffer, key_num, + (int64*)restore_buff.version_buffer, + (int64*)restore_buff.freq_buffer); + return s; + } + + int total_dim() override { return hbm_feat_desc_->total_dim(); } + + private: + void BatchGetValuePtrs( + const EmbeddingVarContext& ctx, const K* keys, + void** value_ptr_list, int64 num_of_keys, + std::vector>& copyback_cursor_list, + std::vector>* not_found_cursor_list = nullptr) { + int num_worker_threads = ctx.worker_threads->num_threads; + IntraThreadCopyIdAllocator thread_copy_id_alloc(num_worker_threads); + uint64 main_thread_id = Env::Default()->GetCurrentThreadId(); + + std::function>*, int64, int)> + set_not_found_list = 0; + if (not_found_cursor_list != nullptr) { + set_not_found_list = + [](std::vector>* not_found_cursor_list, int64 i, + int copy_id) { + (*not_found_cursor_list)[copy_id].emplace_back(i); + }; + } else { + set_not_found_list = + [](std::vector>* not_found_cursor_list, int64 i, + int copy_id) {}; + } + + auto do_work = [this, keys, value_ptr_list, &thread_copy_id_alloc, + main_thread_id, ©back_cursor_list, set_not_found_list, + ¬_found_cursor_list](int64 start, int64 limit) { + int copy_id = thread_copy_id_alloc.GetCopyIdOfThread(main_thread_id); + for (int64 i = start; i < limit; i++) { + Status s = Get(keys[i], &value_ptr_list[i]); + if (s.ok()) { + int64 copyback_flag = + (int64)value_ptr_list[i] >> copyback_flag_offset_bits_; + RemoveCopyBackFlagInValuePtr(&value_ptr_list[i]); + if (copyback_flag == CopyBackFlag::COPYBACK) { + copyback_cursor_list[copy_id].emplace_back(i); + } + } else { + value_ptr_list[i] = nullptr; + set_not_found_list(not_found_cursor_list, i, copy_id); + } + } + }; + auto worker_threads = ctx.worker_threads; + 
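+    // Shard the per-key lookups over the intra-op thread pool; each worker
+    // appends to its own copy-id-indexed list, so recording copyback and
+    // missing keys needs no locking, and the lists are spliced together below.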
Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + 1000, do_work); + + for (int i = 1; i < worker_threads->num_threads + 1; i++) { + if (copyback_cursor_list[i].size() > 0) { + copyback_cursor_list[0].splice(copyback_cursor_list[0].end(), + copyback_cursor_list[i]); + } + } + + if (not_found_cursor_list != nullptr) { + for (int i = 1; i < worker_threads->num_threads + 1; i++) { + if ((*not_found_cursor_list)[i].size() > 0) { + (*not_found_cursor_list)[0].splice((*not_found_cursor_list)[0].end(), + (*not_found_cursor_list)[i]); + } + } + } + } + + void CopyEmbeddingsFromDramToHbm(const EmbeddingVarContext& ctx, + const K* keys, void** value_ptr_list, + std::list& copyback_cursors) { + int64 total = copyback_cursors.size(); + std::vector gpu_value_ptrs(total); + std::vector copyback_keys(total); + std::vector memory_index(total); + // Create Hbm ValuePtrs. + int64 i = 0; + auto it = copyback_cursors.cbegin(); + // Mutex with eviction thread + for (; it != copyback_cursors.cend(); ++it, ++i) { + int64 j = *it; + memory_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + hbm_feat_desc_->SetFreq(gpu_value_ptr, + dram_feat_desc_->GetFreq(value_ptr_list[i])); + hbm_feat_desc_->UpdateVersion( + gpu_value_ptr, dram_feat_desc_->GetVersion(value_ptr_list[i])); + gpu_value_ptrs[i] = gpu_value_ptr; + copyback_keys[i] = keys[*it]; + } + MultiTierStorage::CopyEmbeddingsFromDramToHbm( + ctx, keys, value_ptr_list, copyback_cursors, memory_index, + gpu_value_ptrs, hbm_feat_desc_->total_dim(), hbm_feat_desc_, + dram_feat_desc_); + + // Insert copyback ids to hbm hash table. + auto do_insert = [this, copyback_keys, gpu_value_ptrs, memory_index, + value_ptr_list](int64 start, int64 limit) { + for (int64 i = start; i < limit; i++) { + Status s = hbm_->TryInsert(copyback_keys[i], gpu_value_ptrs[i]); + if (!s.ok()) { + hbm_->DestroyValuePtr(gpu_value_ptrs[i]); + hbm_->Get(copyback_keys[i], &value_ptr_list[memory_index[i]]); + } + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, total, 100000, + do_insert); + } + + void CreateValuePtrs(const EmbeddingVarContext& ctx, const K* keys, + void** value_ptr_list, + std::list& not_found_cursors, int64 value_len) { + int64 total = not_found_cursors.size(); + if (total > 0) { + std::vector> insert_pairs(total); + std::vector cursor_index(total); + // Create Hbm ValuePtrs. + int64 i = 0; + auto it = not_found_cursors.cbegin(); + for (; it != not_found_cursors.cend(); ++it, ++i) { + int64 j = *it; + cursor_index[i] = j; + void* gpu_value_ptr = hbm_->CreateValuePtr(); + value_ptr_list[j] = gpu_value_ptr; + insert_pairs[i].first = keys[j]; + insert_pairs[i].second = value_ptr_list[j]; + } + + hbm_feat_desc_->SetDefaultValues(keys, not_found_cursors, value_ptr_list, + ctx.compute_stream, ctx.event_mgr, + ctx.gpu_device); + + // Insert copyback ids to hbm hash table. 
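+      // TryInsert fails when another thread has already inserted the key; in
+      // that case the freshly created value pointer is destroyed and the
+      // pointer already stored for that key is fetched instead.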
+ auto do_insert = [this, insert_pairs, value_ptr_list, cursor_index]( + int64 start, int64 limit) { + for (int64 i = start; i < limit; i++) { + Status s = + hbm_->TryInsert(insert_pairs[i].first, insert_pairs[i].second); + if (!s.ok()) { + hbm_->DestroyValuePtr(insert_pairs[i].second); + hbm_->Get(insert_pairs[i].first, &value_ptr_list[cursor_index[i]]); + } + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, total, 100000, + do_insert); + } + } + + void AddCopyBackFlagToValuePtr(void** value_ptr, CopyBackFlag copyback_flag) { + int64 tmp = ((int64)copyback_flag) << copyback_flag_offset_bits_; + tmp = ((int64)*value_ptr) | tmp; + *value_ptr = reinterpret_cast(tmp); + } + + void RemoveCopyBackFlagInValuePtr(void** value_ptr) { + int64 tmp = (1L << (copyback_flag_offset_bits_)) - 1; + tmp = ((int64)*value_ptr) & tmp; + *value_ptr = reinterpret_cast(tmp); + } + + void Import(K key, V* value, int64 freq, int64 version, + int emb_index) override { + dram_->Import(key, value, freq, version, emb_index); + } + + void ImportToHbm(K* ids, int64 size, int64 value_len, int64 emb_index) { + V* memcpy_buffer_cpu = new V[size * value_len]; + V** value_address = new V*[size]; + V* memcpy_buffer_gpu = (V*)gpu_alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, size * value_len * sizeof(V)); + V* dev_value_address = (V*)gpu_alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, size * sizeof(V*)); + void** gpu_value_ptrs = new void*[size]; + void** cpu_value_ptrs = new void*[size]; + for (int64 i = 0; i < size; i++) { + dram_->Get(ids[i], &cpu_value_ptrs[i]); + gpu_value_ptrs[i] = hbm_->CreateValuePtr(); + Status s = hbm_->TryInsert(ids[i], gpu_value_ptrs[i]); + if (!s.ok()) { + hbm_feat_desc_->Deallocate(gpu_value_ptrs[i]); + hbm_->Get(ids[i], &gpu_value_ptrs[i]); + } + } + // Split from above for loop for minize the cost of mutex lock + // TODO: Speed up with intra parallelism + + for (int64 i = 0; i < size; i++) { + memcpy(memcpy_buffer_cpu + i * value_len, + dram_feat_desc_->GetEmbedding(cpu_value_ptrs[i], emb_index), + value_len * sizeof(V)); + value_address[i] = + hbm_feat_desc_->GetEmbedding(gpu_value_ptrs[i], emb_index); + } + cudaMemcpy(memcpy_buffer_gpu, memcpy_buffer_cpu, + size * value_len * sizeof(V), cudaMemcpyHostToDevice); + cudaMemcpy(dev_value_address, value_address, size * sizeof(V*), + cudaMemcpyHostToDevice); + int block_dim = 128; + void* args[] = {(void*)&dev_value_address, (void*)&memcpy_buffer_gpu, + (void*)&value_len, (void*)&size}; + + cudaLaunchKernel((void*)BatchUnpack, + (size + block_dim - 1) / block_dim * value_len, block_dim, + args, 0, NULL); + cudaDeviceSynchronize(); + + delete[] memcpy_buffer_cpu; + delete[] cpu_value_ptrs; + delete[] gpu_value_ptrs; + delete[] value_address; + gpu_alloc_->DeallocateRaw(dev_value_address); + gpu_alloc_->DeallocateRaw(memcpy_buffer_gpu); + } + + private: + HbmStorageWithCpuKv* hbm_ = nullptr; + DramStorage* dram_ = nullptr; + FeatureDescriptor* hbm_feat_desc_ = nullptr; + FeatureDescriptor* dram_feat_desc_ = nullptr; + Allocator* gpu_alloc_; + const int copyback_flag_offset_bits_ = 60; +}; +} // namespace embedding +} // namespace tensorflow + +#endif // GOOGLE_CUDA +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_DRAM_STORAGE_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/hbm_multi_tier_feature_descriptor.h b/deepray/custom_ops/embedding_variable/cc/embedding/hbm_multi_tier_feature_descriptor.h new file mode 100644 index 00000000..ea9639ba --- 
/dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/hbm_multi_tier_feature_descriptor.h @@ -0,0 +1,116 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_MULTI_TIER_FEATURE_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_MULTI_TIER_FEATURE_DESCRIPTOR_H_ +#include "embedding_memory_pool.h" +#include "feature_descriptor_impl.h" +#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/util/env_var.h" +#include "unsupported/Eigen/CXX11/Tensor" // from @eigen_archive +// #include "xla/stream_executor/stream.h" +// #include "xla/stream_executor/stream_executor.h" + +namespace tensorflow { +namespace embedding { +template +class NormalFeatureDescriptorImpl; + +template +class HbmMultiTierFeatureDescriptorImpl : public FeatureDescriptorImpl { + public: + HbmMultiTierFeatureDescriptorImpl(Allocator* alloc, int64 slot_num, + bool need_record_freq, + bool need_record_version) + : dram_alloc_bytes_(sizeof(V*)), + hbm_alloc_(alloc), + dram_alloc_(ev_allocator()), + FeatureDescriptorImpl(slot_num, need_record_freq, + need_record_version) { + FeatureDescriptorImpl::CreateFreqAndVersionDescriptor( + &dram_alloc_bytes_); + } + + ~HbmMultiTierFeatureDescriptorImpl() {} + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) override { + bool is_compute_alloc_bytes = FeatureDescriptorImpl::SetEmbeddingInfo( + emb_index, embedding_dim, default_value); + if (is_compute_alloc_bytes) { + FeatureDescriptorImpl::ComputeAllocBytes(&hbm_alloc_bytes_); + embedding_mem_pool_.reset(new EmbeddingMemoryPool( + hbm_alloc_, hbm_alloc_bytes_ / sizeof(V), 1024 * 1024 * 64)); + } + return is_compute_alloc_bytes; + } + + V* GetEmbedding(void* val, int emb_index) override { + return *((V**)val) + + FeatureDescriptorImpl::slot_infos_[emb_index].embedding_offset; + } + + void* Allocate() override { + void* val = dram_alloc_->AllocateRaw(Allocator::kAllocatorAlignment, + dram_alloc_bytes_); + mutex_lock l(memory_pool_mu_); + *((V**)val) = embedding_mem_pool_->Allocate(); + FeatureDescriptorImpl::InitFreqAndVersion(val); + return val; + } + + void Deallocate(void* val) override { + mutex_lock l(memory_pool_mu_); + embedding_mem_pool_->Deallocate(*((V**)val)); + dram_alloc_->DeallocateRaw(val); + } + + void Deallocate(const std::vector& value_ptrs) override { + mutex_lock l(memory_pool_mu_); + for (auto ptr : value_ptrs) { + embedding_mem_pool_->Deallocate(*((V**)ptr)); + dram_alloc_->DeallocateRaw(ptr); + } + } + void SetDefaultValue(void* val, int64 key) override { + LOG(FATAL) << "Can't call SetDefaultValue(void* val, int64 key," + << "int default_value_len) in HbmMultiTierFeatureDescriptor."; + } + + void SetAllocator(Allocator* alloc) override { hbm_alloc_ = alloc; } + + template + void SetDefaultValues(const K* keys, const 
std::list& init_cursor, + void** value_ptrs, se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device); + + int data_bytes() override { return dram_alloc_bytes_; } + + public: + friend class NormalFeatureDescriptorImpl; + + protected: + int dram_alloc_bytes_; + int hbm_alloc_bytes_ = 0; + mutex memory_pool_mu_; // ensure thread safety of embedding_mem_pool_ + Allocator* hbm_alloc_; + Allocator* dram_alloc_; + std::unique_ptr> embedding_mem_pool_; +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_MULTI_TIER_FEATURE_DESCRIPTOR_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/hbm_storage_iterator.h b/deepray/custom_ops/embedding_variable/cc/embedding/hbm_storage_iterator.h new file mode 100644 index 00000000..848c55bb --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/hbm_storage_iterator.h @@ -0,0 +1,124 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_STORAGE_ITERATOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_STORAGE_ITERATOR_H_ + +#if GOOGLE_CUDA +#include "storage.h" +namespace tensorflow { + +template +class ValuePtr; + +namespace embedding { +template +class HbmValueIterator : public ValueIterator { + public: + HbmValueIterator(const std::vector& key_list, + const std::vector& value_ptr_list, int64 emb_index, + int64 value_len, Allocator* alloc, + FeatureDescriptor* feat_desc) + : value_len_(value_len), alloc_(alloc) { + int64 emb_offset = value_len_ * emb_index; + std::vector> value_parts_vec(kSavedPartitionNum); + for (int64 i = 0; i < key_list.size(); i++) { + for (int part_id = 0; part_id < kSavedPartitionNum; part_id++) { + if (key_list[i] % kSavedPartitionNum == part_id) { + value_parts_vec[part_id].emplace_back( + feat_desc->GetEmbedding(value_ptr_list[i], emb_index)); + break; + } + } + } + + for (int64 i = 0; i < kSavedPartitionNum; i++) { + values_.splice(values_.end(), value_parts_vec[i]); + } + + values_iter_ = values_.begin(); + + num_of_embs_ = buffer_capacity_ / value_len_; + dev_addr_list_ = (V**)alloc_->AllocateRaw(Allocator::kAllocatorAlignment, + num_of_embs_ * sizeof(V*)); + dev_embedding_buffer_ = (V*)alloc_->AllocateRaw( + Allocator::kAllocatorAlignment, buffer_capacity_ * sizeof(V)); + + FillEmbeddingBuffer(); + } + + ~HbmValueIterator() { + alloc_->DeallocateRaw(dev_addr_list_); + alloc_->DeallocateRaw(dev_embedding_buffer_); + } + + V* Next() { + if (buffer_cursor_ == num_of_embs_) { + FillEmbeddingBuffer(); + buffer_cursor_ = 0; + } + + V* val = embedding_buffer_ + value_len_ * buffer_cursor_; + counter_++; + values_iter_++; + buffer_cursor_++; + return val; + } + + private: + void FillEmbeddingBuffer() { + int64 total_num = + std::min(num_of_embs_, (int64)(values_.size() - counter_)); + std::vector local_addr_list(total_num); + auto iter = values_iter_; + for 
(int64 i = 0; i < total_num; i++) { + local_addr_list[i] = *iter; + iter++; + } + cudaMemcpy(dev_addr_list_, local_addr_list.data(), sizeof(V*) * total_num, + cudaMemcpyHostToDevice); + int block_dim = 128; + void* args[] = {(void*)&dev_addr_list_, + (void*)&dev_embedding_buffer_, + (void*)&value_len_, + (void*)&total_num, + nullptr, + nullptr}; + cudaLaunchKernel((void*)BatchCopy, + (total_num + block_dim - 1) / block_dim * value_len_, + block_dim, args, 0, NULL); + cudaDeviceSynchronize(); + cudaMemcpy(embedding_buffer_, dev_embedding_buffer_, + sizeof(V) * total_num * value_len_, cudaMemcpyDeviceToHost); + } + + private: + std::list values_; + typename std::list::iterator values_iter_; + const static int64 buffer_capacity_ = 1024 * 1024 * 1; + V embedding_buffer_[buffer_capacity_]; + int64 counter_ = 0; + int64 buffer_cursor_ = 0; + int64 value_len_; + int64 num_of_embs_ = 0; + Allocator* alloc_; + V** dev_addr_list_; + V* dev_embedding_buffer_; +}; + +} // namespace embedding +} // namespace tensorflow + +#endif // GOOGLE_CUDA +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_HBM_STORAGE_ITERATOR_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/intra_thread_copy_id_allocator.h b/deepray/custom_ops/embedding_variable/cc/embedding/intra_thread_copy_id_allocator.h new file mode 100644 index 00000000..5f97a2e2 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/intra_thread_copy_id_allocator.h @@ -0,0 +1,73 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_INTRA_THREAD_COPY_ID_ALLOCATOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_INTRA_THREAD_COPY_ID_ALLOCATOR_H_ + +#include +#include +#include + +#include "deepray/custom_ops/utils/spin_rw_lock.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/types.h" +namespace tensorflow { + +// Allocate a copy id for each thread +class IntraThreadCopyIdAllocator { + public: + IntraThreadCopyIdAllocator(int num_threads) + : num_worker_threads_(num_threads) { + is_occupy_flag_.reset(new bool[num_worker_threads_]); + memset(is_occupy_flag_.get(), 0, sizeof(bool) * num_worker_threads_); + } + + int64 GetCopyIdOfThread(uint64 main_thread_id) { + uint64 thread_id = Env::Default()->GetCurrentThreadId(); + if (thread_id == main_thread_id) { + return num_worker_threads_; + } else { + int copy_id = -1; + { + spin_rd_lock l(mu_); + auto iter = hash_map_.find(thread_id); + if (iter != hash_map_.end()) { + copy_id = iter->second; + return copy_id; + } + } + if (copy_id == -1) { + // bind a new thread to a local cursor_list + copy_id = thread_id % num_worker_threads_; + while (!__sync_bool_compare_and_swap(&(is_occupy_flag_[copy_id]), false, + true)) { + copy_id = (copy_id + 1) % num_worker_threads_; + } + { + spin_wr_lock l(mu_); + hash_map_.insert(std::pair(thread_id, copy_id)); + } + return copy_id; + } + } + } + + private: + int num_worker_threads_; + std::unique_ptr is_occupy_flag_; + std::map hash_map_; + mutable easy_spinrwlock_t mu_ = EASY_SPINRWLOCK_INITIALIZER; +}; +} // namespace tensorflow +#endif // TENSORFLOW_CORE_FRAMEWORK_INTRA_THREAD_COPY_ID_ALLOCATOR_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/kv_interface.h b/deepray/custom_ops/embedding_variable/cc/embedding/kv_interface.h new file mode 100644 index 00000000..b80b58bd --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/kv_interface.h @@ -0,0 +1,121 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_KV_INTERFACE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_KV_INTERFACE_H_ + +#include "feature_descriptor.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace { +const char* kInferenceMode = "INFERENCE_MODE"; +const int kSavedPartitionNum = 1000; +} // namespace + +template +class GPUHashTable; + +using GPUDevice = Eigen::GpuDevice; +namespace embedding { + +template +class ValueIterator { + public: + virtual V* Next() = 0; +}; + +template +class KVInterface { + public: + virtual ~KVInterface() {} + virtual Status Lookup(K key, void** value_ptr) = 0; + virtual Status Contains(K key) = 0; + virtual Status Insert(K key, const void* value_ptr) = 0; + virtual Status Remove(K key) = 0; + + virtual Status BatchLookup(const K* keys, size_t size, void** value_ptrs) { + return errors::Unimplemented( + "Unimplemented for BatchLookup in KVInterface."); + } + // KV Batch Insert + virtual Status BatchInsert(const std::vector& keys, + const std::vector& value_ptrs) { + return errors::Unimplemented( + "Unimplemented for BatchInsert in KVInterface."); + } + // KV Batch Remove + virtual Status BatchRemove(const K* keys, size_t size) { + return errors::Unimplemented( + "Unimplemented for BatchRemove in KVInterface."); + } + + virtual Status BatchLookupOrCreate(const K* keys, size_t size, + void** value_ptrs) { + return errors::Unimplemented( + "Unimplemented for BatchLookupOrInsert in KVInterface."); + } + + virtual void UpdateValuePtr(K key, void* new_value_ptr, void* old_value_ptr) { + LOG(FATAL) << "Unimplemented for UpdateValuePtr in KVInterface."; + } + + virtual Status BatchCommit(const std::vector& keys, + const std::vector& value_ptrs) = 0; + + // KV Size + virtual int64 Size() const = 0; + + virtual void FreeValuePtr(void* value_ptr) {} + + virtual Status Commit(K key, const void* value_ptr) { return OkStatus(); } + + virtual Status GetSnapshot(std::vector* key_list, + std::vector* value_ptr_list) = 0; + + virtual Status GetShardedSnapshot( + std::vector>& key_list, + std::vector>& value_ptr_list, int partition_id, + int partition_nums) = 0; + + virtual std::string DebugString() const = 0; + + virtual Status BatchLookupOrCreate(const K* keys, V* val, V* default_v, + int32 default_v_num, size_t n, + const GPUDevice& device) { + return OkStatus(); + } + virtual Status BatchLookupOrCreateKeys(const K* keys, size_t n, + int32* item_idxs, + const GPUDevice& device) { + return OkStatus(); + } + + virtual Status BatchLookup(const GPUDevice& device, const K* keys, V* val, + size_t n, const V* default_v) { + return errors::Unimplemented( + "Unimplemented for BatchLookup in KVInterface."); + } + + virtual GPUHashTable* HashTable() { return nullptr; } + + virtual void SetValueLen(int64 value_len) {} +}; + +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_KV_INTERFACE_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/l2weight_shrink_policy.h b/deepray/custom_ops/embedding_variable/cc/embedding/l2weight_shrink_policy.h new file mode 100644 index 00000000..18f3d2b8 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/l2weight_shrink_policy.h @@ -0,0 +1,71 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_L2WEIGHT_SHRINK_POLICY_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_L2WEIGHT_SHRINK_POLICY_H_ + +#include "shrink_policy.h" + +namespace tensorflow { + +namespace embedding { +template +class L2WeightShrinkPolicy : public ShrinkPolicy { + public: + L2WeightShrinkPolicy(float l2_weight_threshold, int64 index, + FeatureDescriptor* feat_desc, KVInterface* kv) + : index_(index), + kv_(kv), + l2_weight_threshold_(l2_weight_threshold), + ShrinkPolicy(feat_desc) {} + + TF_DISALLOW_COPY_AND_ASSIGN(L2WeightShrinkPolicy); + + void Shrink(std::vector& key_list, std::vector& value_list, + const ShrinkArgs& shrink_args) override { + ShrinkPolicy::ReleaseValuePtrs(); + FilterToDelete(shrink_args.value_len, key_list, value_list); + } + + private: + void FilterToDelete(int64 value_len, std::vector& key_list, + std::vector& value_list) { + for (int64 i = 0; i < key_list.size(); ++i) { + V* val = + ShrinkPolicy::feat_desc_->GetEmbedding(value_list[i], index_); + if (val != nullptr) { + V l2_weight = (V)0.0; + for (int64 j = 0; j < value_len; j++) { + l2_weight += val[j] * val[j]; + } + l2_weight *= (V)0.5; + if (l2_weight < (V)l2_weight_threshold_) { + kv_->Remove(key_list[i]); + value_list[i] = (void*)ValuePtrStatus::IS_DELETED; + ShrinkPolicy::EmplacePointer(value_list[i]); + } + } + } + } + + private: + int64 index_; + // int64 offset_; + KVInterface* kv_; + float l2_weight_threshold_; +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_L2WEIGHT_SHRINK_POLICY_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/leveldb_kv.h b/deepray/custom_ops/embedding_variable/cc/embedding/leveldb_kv.h new file mode 100644 index 00000000..8d415d75 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/leveldb_kv.h @@ -0,0 +1,288 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LEVELDB_KV_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LEVELDB_KV_H_ + +#include + +#include "kv_interface.h" +#include "leveldb/comparator.h" +#include "leveldb/db.h" +#include "leveldb/write_batch.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/io/path.h" + +using leveldb::DB; +using leveldb::Options; +using leveldb::ReadOptions; +using leveldb::WriteBatch; +using leveldb::WriteOptions; + +namespace tensorflow { +namespace embedding { + +template +class SizeCounter { + public: + SizeCounter(int num_parts) { + num_parts_ = num_parts; + for (int i = 0; i < num_parts_; i++) { + counter_.emplace_back(0); + } + } + + void add(K key, int64 count) { + int part = key % num_parts_; + __sync_fetch_and_add(&counter_[part], count); + } + + void sub(K key, int64 count) { + int part = key % num_parts_; + __sync_fetch_and_sub(&counter_[part], count); + } + + int64 size() { + int64 total = 0; + for (int i = 0; i < num_parts_; i++) { + total += counter_[i]; + } + return total; + } + + private: + std::vector counter_; + int num_parts_; +}; + +template +class LevelDBKV : public KVInterface { + public: + LevelDBKV(std::string path, FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc) { + path_ = io::JoinPath( + path, "level_db_" + std::to_string(Env::Default()->NowMicros())); + ; + options_.create_if_missing = true; + leveldb::Status s = leveldb::DB::Open(options_, path_, &db_); + CHECK(s.ok()); + counter_ = new SizeCounter(8); + } + + ~LevelDBKV() override { delete db_; } + + Status Lookup(K key, void** value_ptr) override { + std::string val_str; + leveldb::Slice db_key((char*)(&key), sizeof(void*)); + leveldb::ReadOptions options; + leveldb::Status s = db_->Get(options, db_key, &val_str); + if (s.IsNotFound()) { + return errors::NotFound("Unable to find Key: ", key, " in LevelDB."); + } else { + void* val = feat_desc_->Allocate(); + memcpy((int64*)val, &val_str[0], val_str.length()); + *value_ptr = val; + return OkStatus(); + } + } + + Status Contains(K key) override { + std::string val_str; + leveldb::Slice db_key((char*)(&key), sizeof(void*)); + leveldb::ReadOptions options; + leveldb::Status s = db_->Get(options, db_key, &val_str); + if (s.IsNotFound()) { + return errors::NotFound("Unable to find Key: ", key, " in LevelDB."); + } else { + return OkStatus(); + } + } + + Status Insert(K key, const void* value_ptr) override { + counter_->add(key, 1); + return OkStatus(); + } + + Status BatchInsert(const std::vector& keys, + const std::vector& value_ptrs) override { + return BatchCommit(keys, value_ptrs); + } + + Status BatchCommit(const std::vector& keys, + const std::vector& value_ptrs) override { + WriteBatch batch; + for (int i = 0; i < keys.size(); i++) { + std::string value_res((char*)value_ptrs[i], feat_desc_->data_bytes()); + leveldb::Slice db_key((char*)(&keys[i]), sizeof(void*)); + batch.Put(db_key, value_res); + delete value_ptrs[i]; + } + db_->Write(WriteOptions(), &batch); + return OkStatus(); + } + + Status Commit(K key, const void* value_ptr) override { + std::string value_res((char*)value_ptr, feat_desc_->data_bytes()); + leveldb::Slice db_key((char*)(&key), sizeof(void*)); + leveldb::Status s = db_->Put(WriteOptions(), db_key, value_res); + if (!s.ok()) { + return errors::AlreadyExists("already exists Key: ", key, " in RocksDB."); + } else { + return OkStatus(); + } + } + + Status Remove(K key) override { + counter_->sub(key, 1); 
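+    // The sharded SizeCounter is kept in sync on every Insert/Remove since
+    // LevelDB does not expose a cheap exact key count; Size() simply sums the
+    // per-shard counters.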
+ leveldb::Slice db_key((char*)(&key), sizeof(void*)); + leveldb::Status s = db_->Delete(WriteOptions(), db_key); + if (s.ok()) { + return OkStatus(); + } else { + return errors::NotFound("Unable to find Key: ", key, " in RocksDB."); + } + } + + Status GetSnapshot(std::vector* key_list, + std::vector* value_ptr_list) override { + ReadOptions options; + options.snapshot = db_->GetSnapshot(); + leveldb::Iterator* it = db_->NewIterator(options); + void* dram_value_ptr = feat_desc_->Allocate(); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + K key; + memcpy((char*)&key, it->key().ToString().data(), sizeof(K)); + key_list->emplace_back(key); + FeatureDescriptor hbm_feat_desc(1, 1, ev_allocator() /*useless*/, + StorageType::HBM_DRAM, true, true, + {false, 0}); + void* value_ptr = cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, hbm_feat_desc.data_bytes()); + memcpy(dram_value_ptr, it->value().ToString().data(), + feat_desc_->data_bytes()); + hbm_feat_desc.SetFreq(value_ptr, feat_desc_->GetFreq(dram_value_ptr)); + hbm_feat_desc.UpdateVersion(value_ptr, + feat_desc_->GetVersion(dram_value_ptr)); + value_ptr_list->emplace_back(value_ptr); + } + delete it; + feat_desc_->Deallocate(dram_value_ptr); + return OkStatus(); + } + + Status GetShardedSnapshot(std::vector>& key_list, + std::vector>& value_ptr_list, + int partition_id, int partition_nums) override { + ReadOptions options; + options.snapshot = db_->GetSnapshot(); + leveldb::Iterator* it = db_->NewIterator(options); + void* dram_value_ptr = feat_desc_->Allocate(); + for (it->SeekToFirst(); it->Valid(); it->Next()) { + K key; + memcpy((char*)&key, it->key().ToString().data(), sizeof(K)); + int part_id = key % kSavedPartitionNum % partition_nums; + if (part_id == partition_id) continue; + key_list[part_id].emplace_back(key); + FeatureDescriptor hbm_feat_desc(1, 1, ev_allocator() /*useless*/, + StorageType::HBM_DRAM, true, true, + {false, 0}); + void* value_ptr = cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, hbm_feat_desc.data_bytes()); + memcpy(dram_value_ptr, it->value().ToString().data(), + feat_desc_->data_bytes()); + hbm_feat_desc.SetFreq(value_ptr, feat_desc_->GetFreq(dram_value_ptr)); + hbm_feat_desc.UpdateVersion(value_ptr, + feat_desc_->GetVersion(dram_value_ptr)); + value_ptr_list[part_id].emplace_back(value_ptr); + } + delete it; + feat_desc_->Deallocate(dram_value_ptr); + return OkStatus(); + } + + int64 Size() const override { return counter_->size(); } + + void FreeValuePtr(void* value_ptr) override { + feat_desc_->Deallocate(value_ptr); + } + + std::string DebugString() const override { return ""; } + + private: + DB* db_; + SizeCounter* counter_; + Options options_; + std::string path_; + FeatureDescriptor* feat_desc_; +}; + +template +class DBValueIterator : public ValueIterator { + public: + DBValueIterator(const std::vector& key_list, int64 emb_index, + int64 value_len, LevelDBKV* leveldb_kv, + FeatureDescriptor* feat_desc) + : value_len_(value_len), + emb_index_(emb_index), + leveldb_kv_(leveldb_kv), + feat_desc_(feat_desc) { + int64 emb_offset = value_len_ * emb_index; + std::vector> keys_parts_vec(kSavedPartitionNum); + for (int64 i = 0; i < key_list.size(); i++) { + for (int part_id = 0; part_id < kSavedPartitionNum; part_id++) { + if (key_list[i] % kSavedPartitionNum == part_id) { + keys_parts_vec[part_id].emplace_back(key_list[i]); + break; + } + } + } + + for (int64 i = 0; i < kSavedPartitionNum; i++) { + keys_.splice(keys_.end(), keys_parts_vec[i]); + } + + keys_iter_ = 
keys_.begin(); + } + + ~DBValueIterator() { delete value_ptr_; } + + V* Next() { + if (value_ptr_ != nullptr) { + feat_desc_->Deallocate(value_ptr_); + } + K key = *(keys_iter_++); + + Status s = leveldb_kv_->Lookup(key, &value_ptr_); + if (!s.ok()) { + LOG(FATAL) << "Not found value in LevelDB when Save."; + } + return feat_desc_->GetEmbedding(value_ptr_, emb_index_); + } + + private: + int64 value_len_; + int64 emb_index_; + LevelDBKV* leveldb_kv_; + FeatureDescriptor* feat_desc_; + std::list keys_; + typename std::list::const_iterator keys_iter_; + void* value_ptr_ = nullptr; + int64 key_cursor_ = 0; +}; + +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_LEVELDB_KV_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/multi_tier_storage.cu.cc b/deepray/custom_ops/embedding_variable/cc/embedding/multi_tier_storage.cu.cc new file mode 100644 index 00000000..b2cd8026 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/multi_tier_storage.cu.cc @@ -0,0 +1,188 @@ +/* Copyright 2019 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "multi_tier_storage.h" + +#include "hbm_multi_tier_feature_descriptor.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { +using se::DeviceMemoryBase; +using se::Stream; +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; +void SyncWithEventMgr(se::Stream* stream, EventMgr* event_mgr); + +namespace embedding { +template +void MultiTierStorage::CopyEmbeddingsFromDramToHbm( + const EmbeddingVarContext& ctx, const K* keys, + void** value_ptr_list, std::list& copyback_cursor, + const std::vector& memory_index, + const std::vector& gpu_value_ptrs, int value_len, + FeatureDescriptor* hbm_feat_desc, FeatureDescriptor* dram_feat_desc) { + if (copyback_cursor.size() > 0) { + int total = copyback_cursor.size(); + // Alocate memcpy buffer on CPU and GPU. 
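+    // The DRAM rows are first gathered into a contiguous host staging buffer,
+    // copied to the device with a single ThenMemcpy, and then scattered into
+    // the per-key HBM value pointers by the BatchUnpack kernel launched below.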
+ Allocator* gpu_alloc = ctx.gpu_allocator; + V* memcpy_buffer_gpu = (V*)gpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, total * value_len * sizeof(V)); + V* memcpy_buffer_cpu = (V*)cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, total * value_len * sizeof(V)); + + // Copy embeddings on CPU to bufer on CPU + auto do_work = [memory_index, memcpy_buffer_cpu, value_ptr_list, + gpu_value_ptrs, dram_feat_desc, value_len, + this](int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + int j = memory_index[i]; + memcpy(memcpy_buffer_cpu + i * value_len, + dram_feat_desc->GetEmbedding(value_ptr_list[j], 0), + value_len * sizeof(V)); + value_ptr_list[j] = gpu_value_ptrs[i]; + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, total, 1000, + do_work); + + // Copy embeddings from CPU buffer to GPU buffer + auto compute_stream = ctx.compute_stream; + auto event_mgr = ctx.event_mgr; + DeviceMemoryBase gpu_buffer_dst_ptr(memcpy_buffer_gpu, + total * value_len * sizeof(V)); + compute_stream->ThenMemcpy(&gpu_buffer_dst_ptr, memcpy_buffer_cpu, + total * value_len * sizeof(V)); + SyncWithEventMgr(compute_stream, event_mgr); + + // Copy addr of embeddings on GPU to GPU + V** value_address = (V**)cpu_allocator()->AllocateRaw( + Allocator::kAllocatorAlignment, sizeof(V*) * total); + V** dev_value_address = (V**)gpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, sizeof(V*) * total); + int64 i = 0; + auto it = copyback_cursor.cbegin(); + for (; it != copyback_cursor.cend(); ++it, ++i) { + // Get the cursor + int64 cursor = *it; + value_address[i] = hbm_feat_desc->GetEmbedding(gpu_value_ptrs[i], 0); + } + DeviceMemoryBase gpu_addr_dst_ptr(dev_value_address, total * sizeof(V*)); + compute_stream->ThenMemcpy(&gpu_addr_dst_ptr, value_address, + total * sizeof(V*)); + + // Copy each embedding to corresponding address + int block_dim = 128; + TF_CHECK_OK(GpuLaunchKernel( + BatchUnpack, (total + block_dim - 1) / block_dim * value_len, + block_dim, 0, ctx.gpu_device.stream(), dev_value_address, + memcpy_buffer_gpu, value_len, total)); + SyncWithEventMgr(compute_stream, event_mgr); + + gpu_alloc->DeallocateRaw(dev_value_address); + gpu_alloc->DeallocateRaw(memcpy_buffer_gpu); + cpu_allocator()->DeallocateRaw(value_address); + cpu_allocator()->DeallocateRaw(memcpy_buffer_cpu); + } +} +#define REGISTER_KERNELS(ktype, vtype) \ + template void MultiTierStorage::CopyEmbeddingsFromDramToHbm( \ + const EmbeddingVarContext&, const ktype*, void**, \ + std::list&, const std::vector&, const std::vector&, \ + int, FeatureDescriptor*, FeatureDescriptor*); +#define REGISTER_KERNELS_ALL(type) \ + REGISTER_KERNELS(int32, type); \ + REGISTER_KERNELS(int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +template +template +void HbmMultiTierFeatureDescriptorImpl::SetDefaultValues( + const K* keys, const std::list& init_cursor, void** value_ptrs, + se::Stream* compute_stream, EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + if (init_cursor.size() > 0) { + int64 total = init_cursor.size(); + TValue** value_address = nullptr; + value_address = TypedAllocator::Allocate( + cpu_allocator(), total * 2, AllocationAttributes()); + TValue** default_value_address = value_address + total; + TValue** dev_value_address = nullptr; + dev_value_address = TypedAllocator::Allocate( + 
hbm_alloc_, total * 2, AllocationAttributes()); + TValue** dev_default_value_address = dev_value_address + total; + for (int emb_index = 0; + emb_index < FeatureDescriptorImpl::slot_infos_.size(); + emb_index++) { + int64 i = 0; + auto it = init_cursor.cbegin(); + for (; it != init_cursor.cend(); ++it, ++i) { + value_address[i] = GetEmbedding(value_ptrs[*it], emb_index); + default_value_address[i] = + FeatureDescriptorImpl::GetDefaultValuePtr(emb_index, + keys[i]); + } + DeviceMemoryBase gpu_dst_ptr(dev_value_address, + total * 2 * sizeof(TValue*)); + compute_stream->ThenMemcpy(&gpu_dst_ptr, value_address, + total * 2 * sizeof(TValue*)); + int block_dim = 128; + int value_len = FeatureDescriptorImpl::slot_infos_[emb_index] + .default_value_len; + TF_CHECK_OK(GpuLaunchKernel( + embedding::CopyEmbedding, + (total * value_len + block_dim - 1) / block_dim, block_dim, 0, + gpu_device.stream(), dev_default_value_address, dev_value_address, + value_len, total)); + SyncWithEventMgr(compute_stream, event_mgr); + } + + TypedAllocator::Deallocate(hbm_alloc_, dev_value_address, total * 2); + TypedAllocator::Deallocate(cpu_allocator(), value_address, total * 2); + } +} + +#define REGISTER_KERNELS(ktype, vtype) \ + template void HbmMultiTierFeatureDescriptorImpl::SetDefaultValues( \ + const ktype*, const std::list&, void**, se::Stream*, EventMgr*, \ + const Eigen::GpuDevice& gpu_device); +#define REGISTER_KERNELS_ALL(type) \ + REGISTER_KERNELS(int32, type); \ + REGISTER_KERNELS(int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS +} // namespace embedding +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/multi_tier_storage.h b/deepray/custom_ops/embedding_variable/cc/embedding/multi_tier_storage.h new file mode 100644 index 00000000..03e713b6 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/multi_tier_storage.h @@ -0,0 +1,303 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_MULTI_TIER_STORAGE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_MULTI_TIER_STORAGE_H_ + +#include "cache_factory.h" +#include "cache_thread_pool_creator.h" +#include "cpu_hash_map_kv.h" +#include "deepray/custom_ops/embedding_variable/config.pb.h" +#include "embedding_var_context.h" +#include "embedding_var_restore.h" +#include "eviction_manager.h" +#include "globalstep_shrink_policy.h" +#include "kv_interface.h" +#include "l2weight_shrink_policy.h" +#include "storage.h" +#include "storage_config.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/threadpool.h" + +#if GOOGLE_CUDA +#include "batch.h" +#endif + +namespace tensorflow { +template +class EmbeddingVar; + +template +struct SsdRecordDescriptor; + +namespace embedding { +template +class MultiTierStorage : public Storage { + public: + MultiTierStorage(const StorageConfig& sc, const std::string& name) + : Storage(sc), name_(name) {} + + virtual ~MultiTierStorage() { delete cache_; } + + TF_DISALLOW_COPY_AND_ASSIGN(MultiTierStorage); + + virtual void Init() override { + cache_capacity_ = + Storage::storage_config_.size[0] / (total_dim() * sizeof(V)); + ready_eviction_ = true; + } + + int64 CacheSize() const override { return cache_capacity_; } + + BatchCache* Cache() override { return cache_; } + + void InitCache(embedding::CacheStrategy cache_strategy) override { + if (cache_ == nullptr) { + cache_ = CacheFactory::Create(cache_strategy, name_); + eviction_manager_ = EvictionManagerCreator::Create(); + eviction_manager_->AddStorage(this); + cache_thread_pool_ = CacheThreadPoolCreator::Create(); + } + } + + Status BatchCommit(const std::vector& keys, + const std::vector& value_ptrs) override { + LOG(FATAL) << "BatchCommit isn't supported by MultiTierStorage."; + return OkStatus(); + } + + Status GetSnapshot(std::vector* key_list, + std::vector* value_ptr_list) override { + LOG(FATAL) << "Can't get snapshot of MultiTierStorage."; + return OkStatus(); + } + + Status GetShardedSnapshot(std::vector>& key_list, + std::vector>& value_ptr_list, + int partition_id, int partition_nums) override { + LOG(FATAL) << "Can't get sharded snapshot of MultiTierStorage."; + return OkStatus(); + } + + void CopyEmbeddingsFromCPUToGPU( + int total, const K* keys, const std::list& copyback_cursor, + V** memcpy_address, size_t value_len, void** gpu_value_ptrs, + V* memcpy_buffer_gpu, se::Stream* compute_stream, EventMgr* event_mgr, + const DeviceBase::CpuWorkerThreads* worker_threads) override { + LOG(FATAL) << "Unsupport CopyEmbeddingsFromCPUToGPU in MultiTierStorage."; + }; + + Status Contains(K key) override { + LOG(FATAL) << "Contains is not support in MultiTierStorage."; + return OkStatus(); + } + + bool IsMultiLevel() override { return true; } + + void CreateEmbeddingMemoryPool(Allocator* alloc, int64 value_len, + int64 block_size) override { + return; + } + + void Schedule(std::function fn) override { + cache_thread_pool_->Schedule(std::move(fn)); + } + + virtual Status Eviction(K* evict_ids, int64 evict_size) override { + LOG(FATAL) << "Eviction isn't support by " << typeid(this).name(); + return OkStatus(); + } + + virtual void BatchEviction() { + constexpr int EvictionSize = 10000; + K evic_ids[EvictionSize]; + if (!ready_eviction_) return; + int cache_count = cache_->size(); + if (cache_count > cache_capacity_) { + // eviction + int k_size = cache_count - cache_capacity_; + k_size = 
std::min(k_size, EvictionSize); + size_t true_size = cache_->get_evic_ids(evic_ids, k_size); + EvictionWithDelayedDestroy(evic_ids, true_size); + } + } + + void UpdateCache(const Tensor& indices, + const Tensor& indices_counts) override { + Schedule([this, indices, indices_counts]() { + cache_->update(indices, indices_counts); + }); + } + + void UpdateCache(const Tensor& indices) override { + Schedule([this, indices]() { cache_->update(indices); }); + } + + virtual bool IsUseHbm() override { return false; } + + void AddToCachePrefetchList(const Tensor& indices) override { + Schedule([this, indices]() { cache_->add_to_prefetch_list(indices); }); + } + + void AddToCache(const Tensor& indices) override { + Schedule([this, indices]() { cache_->add_to_cache(indices); }); + } + + Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, + int64 partition_num, int64 value_len, bool is_filter, + bool is_incr, const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, + FilterPolicy>* filter, + RestoreBuffer& restore_buff) override { + Status s = filter->Restore(key_num, bucket_num, partition_id, partition_num, + value_len, is_filter, false /*to_dram*/, is_incr, + restore_buff); + + if (emb_config.is_primary()) { + K* key_buff = (K*)restore_buff.key_buffer; + V* value_buff = (V*)restore_buff.value_buffer; + int64* version_buff = (int64*)restore_buff.version_buffer; + int64* freq_buff = (int64*)restore_buff.freq_buffer; + if (cache_) { + cache_->update(key_buff, key_num, version_buff, freq_buff); + auto cache_size = CacheSize(); + if (cache_->size() > cache_size) { + int64 evict_size = cache_->size() - cache_size; + std::vector evict_ids(evict_size); + size_t true_size = cache_->get_evic_ids(evict_ids.data(), evict_size); + Eviction(evict_ids.data(), true_size); + } + } + return s; + } + return s; + } + virtual int total_dim() = 0; + + void DeleteFromEvictionManager() { eviction_manager_->DeleteStorage(this); } + + void ReleaseValuePtrs(std::deque& value_ptrs, + FeatureDescriptor* feat_desc) { + constexpr int CAP_INVALID_VALUEPTR = 64 * 1024; + if (value_ptrs.size() > CAP_INVALID_VALUEPTR) { + int64 num_of_deleted_value_ptrs = + value_ptrs.size() - CAP_INVALID_VALUEPTR; + for (int i = 0; i < num_of_deleted_value_ptrs; i++) { + void* value_ptr = value_ptrs.front(); + feat_desc->Deallocate(value_ptr); + value_ptrs.pop_front(); + } + } + } + + void ReleaseInvalidValuePtr(FeatureDescriptor* feat_desc) { + ReleaseValuePtrs(value_ptr_out_of_date_, feat_desc); + } + + void KeepInvalidValuePtr(void* value_ptr) { + value_ptr_out_of_date_.emplace_back(value_ptr); + } + +#if GOOGLE_CUDA + void CopyEmbeddingsFromDramToHbm( + const EmbeddingVarContext& context, const K* keys, + void** value_ptr_list, std::list& copyback_cursors, + const std::vector& memory_index, + const std::vector& gpu_value_ptrs, int value_len, + FeatureDescriptor* hbm_feat_desc, + FeatureDescriptor* dram_feat_desc); +#endif // GOOGL_CUDA + private: + virtual Status EvictionWithDelayedDestroy(K* evict_ids, int64 evict_size) {} + + protected: + std::deque value_ptr_out_of_date_; + BatchCache* cache_ = nullptr; + + EvictionManager* eviction_manager_; + thread::ThreadPool* cache_thread_pool_; + + condition_variable shutdown_cv_; + volatile bool shutdown_ = false; + + int64 cache_capacity_ = -1; + volatile bool ready_eviction_ = false; + + std::string name_; + std::vector mu_list_; +}; + +#if GOOGLE_CUDA +template +void CopyEmbeddingFromHbmToDram(const std::vector& hbm_value_ptrs, + const std::vector& dram_value_ptrs, + 
Allocator* gpu_alloc, + FeatureDescriptor* hbm_feat_desc, + FeatureDescriptor* dram_feat_desc) { + int batch_size = hbm_value_ptrs.size(); + V** dev_value_address; + + dev_value_address = (V**)gpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, batch_size * sizeof(V*)); + Allocator* cpu_alloc = ev_allocator(); + V** value_address = (V**)cpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, sizeof(V*) * batch_size); + + V* batch_data_place; + V* dev_batch_data_place; + int total_dim = dram_feat_desc->total_dim(); + dev_batch_data_place = (V*)gpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, sizeof(V) * batch_size * total_dim); + batch_data_place = (V*)cpu_alloc->AllocateRaw( + Allocator::kAllocatorAlignment, sizeof(V) * batch_size * total_dim); + // Copy GPU addresses V* + for (int i = 0; i < batch_size; ++i) { + value_address[i] = hbm_feat_desc->GetEmbedding(hbm_value_ptrs[i], 0); + } + cudaMemcpyAsync(dev_value_address, value_address, sizeof(V*) * batch_size, + cudaMemcpyHostToDevice); + + // Launch Kernel,Copy data to continuous place + int block_dim = 128; + void* args[] = {(void*)&dev_value_address, (void*)&dev_batch_data_place, + (void*)&total_dim, (void*)&batch_size}; + + cudaLaunchKernel((void*)BatchCopy, + (batch_size * total_dim + block_dim - 1) / block_dim, + block_dim, args, 0, NULL); + + cudaMemcpyAsync(batch_data_place, dev_batch_data_place, + sizeof(V) * batch_size * total_dim, cudaMemcpyDeviceToHost); + + cudaEvent_t is_finish_; + cudaEventCreate(&is_finish_); + cudaEventRecord(is_finish_); + cudaEventSynchronize(is_finish_); + cudaEventDestroy(is_finish_); + + for (int i = 0; i < batch_size; ++i) { + memcpy(dram_feat_desc->GetEmbedding(dram_value_ptrs[i], 0), + &batch_data_place[i * total_dim], total_dim * sizeof(V)); + } + + cpu_alloc->DeallocateRaw(value_address); + cpu_alloc->DeallocateRaw(batch_data_place); + gpu_alloc->DeallocateRaw(dev_value_address); + gpu_alloc->DeallocateRaw(dev_batch_data_place); +} +#endif // GOOGL_CUDA +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_MULTI_TIER_STORAGE_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/normal_feature_descriptor.h b/deepray/custom_ops/embedding_variable/cc/embedding/normal_feature_descriptor.h new file mode 100644 index 00000000..da844008 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/normal_feature_descriptor.h @@ -0,0 +1,127 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_NORMAL_FEATURE_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_NORMAL_FEATURE_DESCRIPTOR_H_ +#include + +#include "feature_descriptor_impl.h" + +namespace tensorflow { +namespace embedding { +#if GOOGLE_CUDA +template +class HbmMultiTierFeatureDescriptorImpl; +#endif + +template +class NormalFeatureDescriptorImpl : public FeatureDescriptorImpl { + public: + NormalFeatureDescriptorImpl(Allocator* alloc, int64 slot_num, + bool need_record_freq, bool need_record_version) + : alloc_bytes_(0), + alloc_(alloc), + FeatureDescriptorImpl(slot_num, need_record_freq, + need_record_version) {} + + NormalFeatureDescriptorImpl(NormalFeatureDescriptorImpl* feat_desc_impl) + : alloc_(feat_desc_impl->alloc_), + FeatureDescriptorImpl(feat_desc_impl) {} + + NormalFeatureDescriptorImpl( + HbmMultiTierFeatureDescriptorImpl* feat_desc_impl) + : alloc_bytes_(0), + alloc_(feat_desc_impl->dram_alloc_), + FeatureDescriptorImpl(feat_desc_impl) {} + + ~NormalFeatureDescriptorImpl() {} + + bool InitSlotInfo(int emb_index, int64 embedding_dim, + const std::pair& default_value) override { + bool is_compute_alloc_bytes = FeatureDescriptorImpl::SetEmbeddingInfo( + emb_index, embedding_dim, default_value); + if (is_compute_alloc_bytes) { + FeatureDescriptorImpl::ComputeAllocBytes(&alloc_bytes_); + FeatureDescriptorImpl::CreateFreqAndVersionDescriptor(&alloc_bytes_); + } + return is_compute_alloc_bytes; + } + + bool InitSlotInfo(FeatureDescriptorImpl* feat_desc_impl) override { + FeatureDescriptorImpl::SetSlotInfo(feat_desc_impl); + FeatureDescriptorImpl::ComputeAllocBytes(&alloc_bytes_); + FeatureDescriptorImpl::SetFreqAndVersionOffset(&alloc_bytes_); + return true; + } + + V* GetEmbedding(void* val, int emb_index) override { + return reinterpret_cast(val) + + FeatureDescriptorImpl::slot_infos_[emb_index].embedding_offset; + } + + void* Allocate() override { + void* val = + alloc_->AllocateRaw(Allocator::kAllocatorAlignment, alloc_bytes_); + FeatureDescriptorImpl::InitFreqAndVersion(val); + return val; + } + + void Deallocate(void* val) override { alloc_->DeallocateRaw(val); } + + void Deallocate(const std::vector& value_ptrs) override { + for (auto val : value_ptrs) { + Deallocate(val); + } + } + + void SetValue(void* val, int64 emb_index, V* value) override { + V* val_ptr = GetEmbedding(val, emb_index); + memcpy( + val_ptr, value, + sizeof(V) * + FeatureDescriptorImpl::slot_infos_[emb_index].default_value_len); + } + + void SetDefaultValue(void* val, int64 index) override { + for (int i = 0; i < FeatureDescriptorImpl::slot_infos_.size(); i++) { + V* val_ptr = GetEmbedding(val, i); + FeatureDescriptorImpl::SetDefaultValue((void*)val_ptr, i, index); + } + } + +#if GOOGLE_CUDA + template + void SetDefaultValues(const K* keys, const std::list& init_cursor, + void** value_ptrs, se::Stream* compute_stream, + EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + LOG(FATAL) + << "Can't call SetDefaultValue(const K*, const std::list&," + << "void**, se::Stream*, EventMgr*, const Eigen::GpuDevice&)" + << " in HbmMultiTierFeatureDescriptor."; + } +#endif + + void SetAllocator(Allocator* alloc) override { alloc_ = alloc; } + + int data_bytes() override { return alloc_bytes_; } + + private: + int alloc_bytes_; + Allocator* alloc_; +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_NORMAL_FEATURE_DESCRIPTOR_H_ diff --git 
a/deepray/custom_ops/embedding_variable/cc/embedding/nullable_filter_policy.h b/deepray/custom_ops/embedding_variable/cc/embedding/nullable_filter_policy.h new file mode 100644 index 00000000..1d5c12f7 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/nullable_filter_policy.h @@ -0,0 +1,173 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_NULLABLE_FILTER_POLICY_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_NULLABLE_FILTER_POLICY_H_ + +#include "batch.h" +#include "embedding_config.h" +#include "filter_policy.h" + +namespace tensorflow { +namespace embedding { +template +class Storage; +} + +template +class NullableFilterPolicy : public FilterPolicy { + using FilterPolicy::ev_; + using FilterPolicy::config_; + + public: + NullableFilterPolicy(const EmbeddingConfig& config, EV* ev, + embedding::Storage* storage, + embedding::FeatureDescriptor* feat_desc) + : storage_(storage), + feat_desc_(feat_desc), + FilterPolicy(config, ev) {} + + Status Lookup(K key, V* val, const V* default_value_ptr, + const V* default_value_no_permission) override { + void* value_ptr = nullptr; + Status s = ev_->LookupKey(key, &value_ptr); + if (s.ok()) { + V* mem_val = feat_desc_->GetEmbedding(value_ptr, config_.emb_index); + memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); + } else { + memcpy(val, default_value_ptr, sizeof(V) * ev_->ValueLen()); + } + return OkStatus(); + } + +#if GOOGLE_CUDA + void BatchLookup(const EmbeddingVarContext& ctx, const K* keys, + V* output, int64 num_of_keys, V* default_value_ptr, + V* default_value_no_permission) override { + std::vector value_ptr_list(num_of_keys, nullptr); + ev_->BatchLookupKey(ctx, keys, value_ptr_list.data(), num_of_keys); + std::vector embedding_ptr(num_of_keys, nullptr); + auto do_work = [this, keys, value_ptr_list, &embedding_ptr, + default_value_ptr, + default_value_no_permission](int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + void* value_ptr = value_ptr_list[i]; + if (value_ptr != nullptr) { + embedding_ptr[i] = + feat_desc_->GetEmbedding(value_ptr, config_.emb_index); + } else { + embedding_ptr[i] = default_value_ptr; + } + } + }; + auto worker_threads = ctx.worker_threads; + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + 1000, do_work); + auto stream = ctx.compute_stream; + auto event_mgr = ctx.event_mgr; + ev_->CopyEmbeddingsToBuffer(output, num_of_keys, embedding_ptr.data(), + stream, event_mgr, ctx.gpu_device); + } + + void BatchLookupOrCreateKey(const EmbeddingVarContext& ctx, + const K* keys, void** value_ptrs, + int64 num_of_keys) { + int num_worker_threads = ctx.worker_threads->num_threads; + std::vector> not_found_cursor_list(num_worker_threads + 1); + ev_->BatchLookupOrCreateKey(ctx, keys, value_ptrs, num_of_keys, + not_found_cursor_list); + } +#endif // GOOGLE_CUDA + + void LookupOrCreate(K key, V* val, const V* 
default_value_ptr, + void** value_ptr, int count, + const V* default_value_no_permission) override { + bool is_filter = true; + TF_CHECK_OK(LookupOrCreateKey(key, value_ptr, &is_filter, count)); + V* mem_val = feat_desc_->GetEmbedding(*value_ptr, config_.emb_index); + memcpy(val, mem_val, sizeof(V) * ev_->ValueLen()); + } + + Status LookupOrCreateKey(K key, void** value_ptr, bool* is_filter, + int64 count) override { + *is_filter = true; + Status s = ev_->LookupKey(key, value_ptr); + if (!s.ok()) { + *value_ptr = feat_desc_->Allocate(); + feat_desc_->SetDefaultValue(*value_ptr, key); + storage_->Insert(key, value_ptr); + s = OkStatus(); + } + feat_desc_->AddFreq(*value_ptr, count); + return s; + } + + Status LookupKey(K key, void** val, bool* is_filter, int64 count) override { + *is_filter = true; + return ev_->LookupKey(key, val); + } + + int64 GetFreq(K key, void* value_ptr) override { + return feat_desc_->GetFreq(value_ptr); + } + + int64 GetFreq(K key) override { + if (!config_.is_save_freq()) return 0; + void* value_ptr = nullptr; + TF_CHECK_OK(ev_->LookupOrCreateKey(key, &value_ptr)); + return feat_desc_->GetFreq(value_ptr); + } + + Status Restore(int64 key_num, int bucket_num, int64 partition_id, + int64 partition_num, int64 value_len, bool is_filter, + bool to_dram, bool is_incr, + RestoreBuffer& restore_buff) override { + K* key_buff = (K*)restore_buff.key_buffer; + V* value_buff = (V*)restore_buff.value_buffer; + int64* version_buff = (int64*)restore_buff.version_buffer; + int64* freq_buff = (int64*)restore_buff.freq_buffer; + for (auto i = 0; i < key_num; ++i) { + // this can describe by graph(Mod + DynamicPartition), + // but memory waste and slow + if (*(key_buff + i) % bucket_num % partition_num != partition_id) { + VLOG(1) << "skip EV key:" << *(key_buff + i); + continue; + } + int64 import_freq = 0; + int64 import_version = -1; + + if (config_.filter_freq != 0 || ev_->IsMultiLevel() || + config_.record_freq) { + import_freq = freq_buff[i]; + } + if (config_.steps_to_live != 0 || config_.record_version) { + import_version = version_buff[i]; + } + ev_->storage()->Import(key_buff[i], value_buff + i * ev_->ValueLen(), + import_freq, import_version, config_.emb_index); + } + return OkStatus(); + } + + bool is_admit(K key, void* value_ptr) override { return true; } + + private: + embedding::Storage* storage_; + embedding::FeatureDescriptor* feat_desc_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_NULLABLE_FILTER_POLICY_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/shrink_policy.h b/deepray/custom_ops/embedding_variable/cc/embedding/shrink_policy.h new file mode 100644 index 00000000..12231fb9 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/shrink_policy.h @@ -0,0 +1,72 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SHRINK_POLICY_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SHRINK_POLICY_H_ + +#include "feature_descriptor.h" +#include "kv_interface.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +namespace embedding { +struct ShrinkArgs { + ShrinkArgs() : global_step(0), value_len(0) {} + + ShrinkArgs(int64 global_step, int64 value_len) + : global_step(global_step), value_len(value_len) {} + int64 global_step; + int64 value_len; +}; + +template +class ShrinkPolicy { + public: + ShrinkPolicy(FeatureDescriptor* feat_desc) : feat_desc_(feat_desc) {} + virtual ~ShrinkPolicy() {} + + TF_DISALLOW_COPY_AND_ASSIGN(ShrinkPolicy); + + virtual void Shrink(std::vector& key_list, std::vector& value_list, + const ShrinkArgs& shrink_args) = 0; + + protected: + void EmplacePointer(void* value_ptr) { to_delete_.emplace_back(value_ptr); } + + void ReleaseValuePtrs() { + for (auto it : to_delete_) { + feat_desc_->Deallocate(it); + } + to_delete_.clear(); + } + + protected: + std::vector to_delete_; + FeatureDescriptor* feat_desc_; +}; + +template +class NonShrinkPolicy : public ShrinkPolicy { + public: + NonShrinkPolicy() : ShrinkPolicy(nullptr) {} + TF_DISALLOW_COPY_AND_ASSIGN(NonShrinkPolicy); + + void Shrink(std::vector& key_list, std::vector& value_list, + const ShrinkArgs& shrink_args) override {} +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SHRINK_POLICY_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/single_tier_storage.h b/deepray/custom_ops/embedding_variable/cc/embedding/single_tier_storage.h new file mode 100644 index 00000000..4dd11652 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/single_tier_storage.h @@ -0,0 +1,581 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SINGLE_TIER_STORAGE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SINGLE_TIER_STORAGE_H_ + +#include "cache.h" +#include "cpu_hash_map_kv.h" +#include "deepray/custom_ops/embedding_variable/config.pb.h" +#include "globalstep_shrink_policy.h" +#if GOOGLE_CUDA +#include "gpu_hash_map_kv.h" +#endif // GOOGLE_CUDA +#include "kv_interface.h" +#include "l2weight_shrink_policy.h" +#include "leveldb_kv.h" +#include "ssd_hash_kv.h" +#include "storage.h" +#include "storage_config.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +template +class EmbeddingVar; + +template +struct SsdRecordDescriptor; + +namespace embedding { +template +class DramSsdHashStorage; + +template +class DramPmemStorage; + +template +class DramLevelDBStore; + +#if GOOGLE_CUDA +template +class HbmDramStorage; + +template +class HbmDramSsdStorage; +#endif + +template +class SingleTierStorage : public Storage { + public: + SingleTierStorage(const StorageConfig& sc, KVInterface* kv, + FeatureDescriptor* feat_desc) + : kv_(kv), feat_desc_(feat_desc), Storage(sc) { + if (sc.embedding_config.steps_to_live != 0) { + shrink_policy_ = new GlobalStepShrinkPolicy( + sc.embedding_config.steps_to_live, feat_desc_, kv_); + } else if (sc.embedding_config.l2_weight_threshold != -1.0) { + shrink_policy_ = new L2WeightShrinkPolicy( + sc.embedding_config.l2_weight_threshold, + sc.embedding_config.primary_emb_index, feat_desc_, kv_); + } else { + shrink_policy_ = new NonShrinkPolicy(); + } + } + + ~SingleTierStorage() override { + mutex_lock l(Storage::mu_); + std::vector key_list; + std::vector value_ptr_list; + kv_->GetSnapshot(&key_list, &value_ptr_list); + for (auto value_ptr : value_ptr_list) { + feat_desc_->Deallocate(value_ptr); + } + delete kv_; + delete shrink_policy_; + } + + TF_DISALLOW_COPY_AND_ASSIGN(SingleTierStorage); + + Status Get(K key, void** value_ptr) override { + return kv_->Lookup(key, value_ptr); + } + + Status Contains(K key) override { return kv_->Contains(key); } + + virtual void CreateAndInsert(K key, void** value_ptr, + bool to_dram = false) override { + do { + *value_ptr = feat_desc_->Allocate(); + Status s = kv_->Insert(key, *value_ptr); + if (s.ok()) { + break; + } else { + feat_desc_->Deallocate(*value_ptr); + } + } while (!(kv_->Lookup(key, value_ptr)).ok()); + } + + virtual void Insert(K key, void** value_ptr) override { + do { + Status s = kv_->Insert(key, *value_ptr); + if (s.ok()) { + break; + } else { + feat_desc_->Deallocate(*value_ptr); + } + } while (!(kv_->Lookup(key, value_ptr)).ok()); + } + + Status GetOrCreate(K key, void** value_ptr) override { + Status s = kv_->Lookup(key, value_ptr); + if (s.ok()) { + return s; + } + + *value_ptr = feat_desc_->Allocate(); + s = kv_->Insert(key, *value_ptr); + if (s.ok()) { + return s; + } + // Insert Failed, key already exist + feat_desc_->Deallocate(*value_ptr); + return kv_->Lookup(key, value_ptr); + } + + Status Remove(K key) override { return kv_->Remove(key); } + + int64 Size() const override { return kv_->Size(); } + + int64 Size(int level) const override { + if (level > 0) { + LOG(FATAL) << "Unsupport level>0 in SingleTierStorage."; + } + return kv_->Size(); + } + + int64 CacheSize() const override { + LOG(FATAL) << "Unsupport cachesize in SingleTierStorage."; + return 0; + } + + int LookupTier(K key) const override { + Status s = kv_->Contains(key); + return (s.ok()) ? 
0 : -1; + } + + void CopyEmbeddingsFromCPUToGPU( + int total, const K* keys, const std::list& copyback_cursor, + V** memcpy_address, size_t value_len, void** gpu_value_ptrs, + V* memcpy_buffer_gpu, se::Stream* compute_stream, EventMgr* event_mgr, + const DeviceBase::CpuWorkerThreads* worker_threads) override { + LOG(FATAL) << "Unsupport CopyEmbeddingsFromCPUToGPU in SingleTierStorage."; + }; + + BatchCache* Cache() override { + LOG(FATAL) << "Unsupport Cache in SingleTierStorage."; + return nullptr; + } + + void InitCache(embedding::CacheStrategy cache_strategy) override { + LOG(FATAL) << "Unsupport InitCache in SingleTierStorage."; + } + + virtual Status BatchCommit(const std::vector& keys, + const std::vector& value_ptrs) override { + LOG(FATAL) << "Unsupport BatchCommit in Storage: " << typeid(this).name(); + return OkStatus(); + } + + virtual Status Commit(K keys, const void* value_ptr) { + LOG(FATAL) << "Unsupport Commit in Storage: " << typeid(this).name(); + return OkStatus(); + } + + Status Eviction(K* evict_ids, int64 evict_size) override { + LOG(FATAL) << "Unsupport Eviction in SingleTierStorage."; + return OkStatus(); + } + + void CreateEmbeddingMemoryPool(Allocator* alloc, int64 value_len, + int64 block_size) override { + return; + } + + virtual void Import(K key, V* value, int64 freq, int64 version, + int emb_index) override {} + + Status GetSnapshot(std::vector* key_list, + std::vector* value_ptr_list) override { + mutex_lock l(Storage::mu_); + return kv_->GetSnapshot(key_list, value_ptr_list); + } + + Status GetShardedSnapshot(std::vector>& key_list, + std::vector>& value_ptr_list, + int partition_id, int partition_nums) override { + mutex_lock l(Storage::mu_); + return kv_->GetShardedSnapshot(key_list, value_ptr_list, partition_id, + partition_nums); + } + + Status Save(const std::string& tensor_name, const std::string& prefix, + BundleWriter* writer, const EmbeddingConfig& emb_config, + ShrinkArgs& shrink_args, int64 value_len, + V* default_value) override { + std::vector value_ptr_list; + std::vector key_list_tmp; + TF_CHECK_OK(kv_->GetSnapshot(&key_list_tmp, &value_ptr_list)); + + if (emb_config.is_primary()) { + Shrink(key_list_tmp, value_ptr_list, shrink_args, value_len); + } + TF_CHECK_OK((Storage::SaveToCheckpoint( + tensor_name, writer, emb_config, value_len, default_value, key_list_tmp, + value_ptr_list, SingleTierStorage::feat_desc_))); + return OkStatus(); + } + + bool IsMultiLevel() override { return false; } + + bool IsUseHbm() override { return false; } + + bool IsSingleHbm() override { return false; } + + bool IsUsePersistentStorage() override { return false; } + + void Schedule(std::function fn) override { + LOG(FATAL) << "Unsupport Schedule in SingleTierStorage."; + } + + void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) override { + kv_->UpdateValuePtr(key, new_value_ptr, old_value_ptr); + } + + protected: + virtual void* CreateValuePtr() { return feat_desc_->Allocate(); } + + virtual void DestroyValuePtr(void* value_ptr) { + feat_desc_->Deallocate(value_ptr); + } + + FeatureDescriptor* feature_descriptor() { return feat_desc_; } + + virtual Status RestoreFeatures(int64 key_num, int bucket_num, + int64 partition_id, int64 partition_num, + int64 value_len, bool is_filter, bool is_incr, + const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, + FilterPolicy>* filter, + RestoreBuffer& restore_buff) override { + Status s = filter->Restore(key_num, bucket_num, partition_id, partition_num, + value_len, is_filter, false 
/*to_dram*/, is_incr, + restore_buff); + return s; + } + + protected: + virtual void Shrink(std::vector& key_list, + std::vector& value_ptr_list, + ShrinkArgs& shrink_args, int64 value_len) { + mutex_lock l(Storage::mu_); + shrink_args.value_len = value_len; + shrink_policy_->Shrink(key_list, value_ptr_list, shrink_args); + } + + protected: + KVInterface* kv_; + ShrinkPolicy* shrink_policy_; + Allocator* alloc_; + FeatureDescriptor* feat_desc_; +}; + +template +class DramStorage : public SingleTierStorage { + public: + DramStorage(const StorageConfig& sc, FeatureDescriptor* feat_desc) + : SingleTierStorage(sc, new LocklessHashMap(feat_desc), + feat_desc) {} + + ~DramStorage() override {} + + Status BatchCommit(const std::vector& keys, + const std::vector& value_ptrs) { + return SingleTierStorage::kv_->BatchCommit(keys, value_ptrs); + } + + Status TryInsert(K key, void* value_ptr) { + return SingleTierStorage::kv_->Insert(key, value_ptr); + } + + Status Commit(K keys, const void* value_ptr) override { + return SingleTierStorage::kv_->Commit(keys, value_ptr); + } + + void Import(K key, V* value, int64 freq, int64 version, + int emb_index) override { + void* value_ptr = SingleTierStorage::feat_desc_->Allocate(freq); + SingleTierStorage::Insert(key, &value_ptr); + SingleTierStorage::feat_desc_->SetValue(value_ptr, emb_index, value); + SingleTierStorage::feat_desc_->SetFreq(value_ptr, freq); + SingleTierStorage::feat_desc_->UpdateVersion(value_ptr, version); + } + + TF_DISALLOW_COPY_AND_ASSIGN(DramStorage); + + public: + friend class DramSsdHashStorage; + friend class DramPmemStorage; + friend class DramLevelDBStore; +#if GOOGLE_CUDA + friend class HbmDramStorage; + friend class HbmDramSsdStorage; +#endif + protected: + void Shrink(std::vector& key_list, std::vector& value_ptr_list, + ShrinkArgs& shrink_args, int64 value_len) override { + SingleTierStorage::Shrink(key_list, value_ptr_list, shrink_args, + value_len); + } +}; + +#if GOOGLE_CUDA +template +class HbmStorage : public SingleTierStorage { + public: + HbmStorage(const StorageConfig& sc, Allocator* gpu_allocator, + FeatureDescriptor* feat_desc) + : SingleTierStorage( + sc, new GPUHashMapKV(sc.embedding_config, gpu_allocator), + feat_desc) {} + ~HbmStorage() override {} + + TF_DISALLOW_COPY_AND_ASSIGN(HbmStorage); + + bool IsSingleHbm() override { return true; } + + void SetValueLen(int64 value_len) override { + SingleTierStorage::kv_->SetValueLen(value_len); + } + + void BatchLookupOrCreate(const K* key, V* val, V* default_v, + int32 default_v_num, size_t n, + const Eigen::GpuDevice& device) override { + SingleTierStorage::kv_->BatchLookupOrCreate(key, val, default_v, + default_v_num, n, device); + } + + void BatchLookupOrCreateKeys(const K* key, int32* item_idxs, size_t n, + const Eigen::GpuDevice& device) override { + SingleTierStorage::kv_->BatchLookupOrCreateKeys(key, n, item_idxs, + device); + } + + void BatchLookup(const Eigen::GpuDevice& device, const K* keys, V* val, + size_t n, const V* default_v) override { + SingleTierStorage::kv_->BatchLookup(device, keys, val, n, default_v); + } + + Status Save(const string& tensor_name, const string& prefix, + BundleWriter* writer, const EmbeddingConfig& emb_config, + ShrinkArgs& shrink_args, int64 value_len, + V* default_value) override { + std::vector value_ptr_list; + std::vector key_list_tmp; + GPUHashMapKV* gpu_kv = + dynamic_cast*>(SingleTierStorage::kv_); + gpu_kv->GetSnapshot(&key_list_tmp, &value_ptr_list, emb_config); + + TF_CHECK_OK((Storage::SaveToCheckpoint( + tensor_name, 
writer, value_len, key_list_tmp, value_ptr_list))); + + if (value_ptr_list.size() > 0) { + TypedAllocator::Deallocate(cpu_allocator(), value_ptr_list[0], + value_ptr_list.size() * value_len); + } + return OkStatus(); + } + + GPUHashTable* HashTable() override { + return SingleTierStorage::kv_->HashTable(); + } + + protected: + Status RestoreFeatures(int64 key_num, int bucket_num, int64 partition_id, + int64 partition_num, int64 value_len, bool is_filter, + bool is_incr, const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, + FilterPolicy>* filter, + RestoreBuffer& restore_buff) override { + K* key_buff = (K*)restore_buff.key_buffer; + V* value_buff = (V*)restore_buff.value_buffer; + std::vector key_import; + std::vector value_import; + for (auto i = 0; i < key_num; ++i) { + if (*(key_buff + i) % bucket_num % partition_num != partition_id) { + LOG(INFO) << "skip EV key:" << *(key_buff + i); + continue; + } + key_import.emplace_back(*(key_buff + i)); + auto row_offset = value_buff + i * value_len; + for (int j = 0; j < value_len; j++) { + value_import.emplace_back(*(row_offset + j)); + } + } + GPUHashMapKV* gpu_kv = + dynamic_cast*>(SingleTierStorage::kv_); + gpu_kv->Import(key_import, value_import, device, emb_config); + return OkStatus(); + } +}; + +template +class HbmStorageWithCpuKv : public SingleTierStorage { + public: + HbmStorageWithCpuKv(const StorageConfig& sc, FeatureDescriptor* feat_desc) + : SingleTierStorage(sc, new LocklessHashMap(feat_desc), + feat_desc) {} + + ~HbmStorageWithCpuKv() override {} + + Status TryInsert(K key, void* value_ptr) { + return SingleTierStorage::kv_->Insert(key, value_ptr); + } + + public: + friend class HbmDramStorage; + friend class HbmDramSsdStorage; + + protected: + void Shrink(std::vector& key_list, std::vector& value_ptr_list, + ShrinkArgs& shrink_args, int64 value_len) override { + SingleTierStorage::Shrink(key_list, value_ptr_list, shrink_args, + value_len); + } +}; +#endif // GOOGLE_CUDA + +template +class PmemMemkindStorage : public SingleTierStorage { + public: + PmemMemkindStorage(const StorageConfig& sc, FeatureDescriptor* feat_desc) + : SingleTierStorage(sc, new LocklessHashMap(feat_desc), + feat_desc) {} + ~PmemMemkindStorage() override {} + + TF_DISALLOW_COPY_AND_ASSIGN(PmemMemkindStorage); +}; + +template +class PmemLibpmemStorage : public SingleTierStorage { + public: + PmemLibpmemStorage(const StorageConfig& sc, FeatureDescriptor* feat_desc) + : SingleTierStorage(sc, new LocklessHashMap(feat_desc), + feat_desc) {} + ~PmemLibpmemStorage() override {} + + Status Commit(K keys, const void* value_ptr) { + return SingleTierStorage::kv_->Commit(keys, value_ptr); + } + + TF_DISALLOW_COPY_AND_ASSIGN(PmemLibpmemStorage); + + protected: + friend class DramPmemStorage; + void Shrink(std::vector& key_list, std::vector& value_ptr_list, + ShrinkArgs& shrink_args, int64 value_len) override { + SingleTierStorage::Shrink(key_list, value_ptr_list, shrink_args, + value_len); + } +}; + +template +class LevelDBStore : public SingleTierStorage { + public: + LevelDBStore(const StorageConfig& sc, FeatureDescriptor* feat_desc) + : SingleTierStorage(sc, new LevelDBKV(sc.path, feat_desc), + feat_desc) {} + ~LevelDBStore() override {} + + TF_DISALLOW_COPY_AND_ASSIGN(LevelDBStore); + + Status Commit(K keys, const void* value_ptr) { + return SingleTierStorage::kv_->Commit(keys, value_ptr); + } + + embedding::ValueIterator* GetValueIterator(const std::vector& key_list, + int64 emb_index, + int64 value_len) { + LevelDBKV* leveldb_kv = + 
reinterpret_cast*>(SingleTierStorage::kv_); + return new DBValueIterator(key_list, emb_index, value_len, leveldb_kv, + SingleTierStorage::feat_desc_); + } + + public: + friend class DramLevelDBStore; +}; + +template +class SsdHashStorage : public SingleTierStorage { + public: + SsdHashStorage(const StorageConfig& sc, FeatureDescriptor* feat_desc) + : SingleTierStorage(sc, new SSDHashKV(sc.path, feat_desc), + feat_desc) {} + ~SsdHashStorage() override {} + + TF_DISALLOW_COPY_AND_ASSIGN(SsdHashStorage); + + Status Commit(K keys, const void* value_ptr) { + return SingleTierStorage::kv_->Commit(keys, value_ptr); + } + + Status Save(const string& tensor_name, const string& prefix, + BundleWriter* writer, const EmbeddingConfig& emb_config, + ShrinkArgs& shrink_args, int64 value_len, + V* default_value) override { + if (emb_config.is_primary()) { + SSDHashKV* ssd_kv = + reinterpret_cast*>(SingleTierStorage::kv_); + SsdRecordDescriptor ssd_rec_desc; + { + mutex_lock l(Storage::mu_); + ssd_kv->SetSsdRecordDescriptor(&ssd_rec_desc); + } + ssd_rec_desc.GenerateCheckpoint(prefix, tensor_name); + } + return OkStatus(); + } + + void Import(K* key_list, int64* key_file_id_list, int64* key_offset_list, + int64 num_of_keys, std::map& file_id_map) { + SSDHashKV* ssd_kv = + reinterpret_cast*>(SingleTierStorage::kv_); + + ssd_kv->Import(key_list, key_file_id_list, key_offset_list, num_of_keys, + file_id_map); + } + + void CopyEmbFilesFromCkpt(int64* file_list, int64* invalid_record_count_list, + int64* record_count_list, int64 num_of_files, + const std::string& ssd_emb_file_name) { + SSDHashKV* ssd_kv = + reinterpret_cast*>(SingleTierStorage::kv_); + + ssd_kv->CopyEmbFilesFromCkpt(file_list, invalid_record_count_list, + record_count_list, num_of_files, + ssd_emb_file_name); + } + + void SetSsdRecordDescriptor(SsdRecordDescriptor* ssd_rec_desc) { + SSDHashKV* ssd_kv = + reinterpret_cast*>(SingleTierStorage::kv_); + ssd_kv->SetSsdRecordDescriptor(ssd_rec_desc); + } + + public: + friend class DramSsdHashStorage; +#if GOOGLE_CUDA + friend class HbmDramSsdStorage; +#endif + + protected: + void Init() override { + dynamic_cast*>(SingleTierStorage::kv_)->Init(); + } +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/ssd_hash_kv.h b/deepray/custom_ops/embedding_variable/cc/embedding/ssd_hash_kv.h new file mode 100644 index 00000000..5471ef05 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/ssd_hash_kv.h @@ -0,0 +1,802 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SSD_HASH_KV_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SSD_HASH_KV_H_ + +#include +#include +#include + +#include "emb_file_creator.h" +#include "kv_interface.h" +#include "sparsehash/dense_hash_map_lockless" +#include "sparsehash/dense_hash_set_lockless" +#include "ssd_record_descriptor.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/util/env_var.h" + +namespace tensorflow { +namespace embedding { +class EmbPosition { + public: + EmbPosition(int o, size_t v, int bo, bool f) + : offset_(o), + version_(v), + buffer_offset_(bo), + flushed_(f), + invalid_(false) {} + + EmbPosition() + : offset_(-1), + version_(-1), + buffer_offset_(-1), + flushed_(false), + invalid_(false) {} + + void Print() { + LOG(INFO) << "EmbPosition: " + << "offset = " << offset_ << ", version = " << version_ + << ", buffer_offset = " << buffer_offset_ + << ", flushed = " << flushed_; + } + + public: + int offset_; + int buffer_offset_; + size_t version_; + bool flushed_; + bool invalid_; +}; + +template +class SSDIterator { + public: + SSDIterator(google::dense_hash_map_lockless* hash_map, + const std::vector& emb_files, int64 value_len, + char* write_buffer) + : emb_files_(emb_files), + curr_file_(0), + curr_vec_(0), + value_len_(value_len), + write_buffer_(write_buffer) { + for (auto it : *hash_map) { + EmbPosition* posi = it.second; + auto iter = file_map_.find(posi->version_); + if (iter == file_map_.end()) { + std::vector> tmp; + file_map_[posi->version_] = tmp; + file_id_vec_.emplace_back(posi->version_); + } + file_map_[posi->version_].emplace_back(it); + } + } + + virtual ~SSDIterator() {} + + virtual bool Valid() { return !(curr_file_ == file_id_vec_.size()); } + + virtual void SeekToFirst() { + curr_file_ = 0; + curr_vec_ = 0; + if (file_id_vec_.size() > 0) { + int64 f_id = file_id_vec_[curr_file_]; + emb_files_[f_id]->MapForRead(); + } + } + + virtual void Next() { + curr_vec_++; + int64 f_id = file_id_vec_[curr_file_]; + if (curr_vec_ == file_map_[f_id].size()) { + emb_files_[f_id]->UnmapForRead(); + curr_vec_ = 0; + curr_file_++; + if (curr_file_ < file_id_vec_.size()) + emb_files_[file_id_vec_[curr_file_]]->MapForRead(); + } + } + + virtual K Key() { + int64 f_id = file_id_vec_[curr_file_]; + return (file_map_[f_id])[curr_vec_].first; + } + + virtual int64 FileId() { return file_id_vec_[curr_file_]; } + + virtual int64 Offset() { + int64 f_id = file_id_vec_[curr_file_]; + EmbPosition* posi = (file_map_[f_id])[curr_vec_].second; + return posi->offset_; + } + + private: + int64 value_len_; + int64 curr_file_; + int64 curr_vec_; + char* write_buffer_; + std::map>> file_map_; + std::vector file_id_vec_; + std::vector emb_files_; +}; + +template +class SSDHashKV : public KVInterface { + public: + explicit SSDHashKV(const std::string& path, FeatureDescriptor* feat_desc) + : feat_desc_(feat_desc) { + path_ = io::JoinPath( + path, "ssd_kv_" + std::to_string(Env::Default()->NowMicros()) + "_"); + hash_map_.max_load_factor(0.8); + hash_map_.set_empty_key_and_value(EMPTY_KEY, nullptr); + hash_map_.set_counternum(16); + hash_map_.set_deleted_key(DELETED_KEY); + evict_file_set_.max_load_factor(0.8); + evict_file_set_.set_empty_key_and_value(EMPTY_KEY, -1); + evict_file_set_.set_counternum(16); + evict_file_set_.set_deleted_key(DELETED_KEY); + + is_async_compaction_ = true; + 
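+    // Note: the two environment variables read just below control how this
+    // store behaves. TF_SSDHASH_ASYNC_COMPACTION (bool, default true)
+    // selects the background compaction thread; setting it to false, e.g.
+    //   TF_SSDHASH_ASYNC_COMPACTION=false
+    // switches to the synchronous compactor that runs inline with Commit()
+    // and BatchCommit(). TF_SSDHASH_IO_SCHEME (default "mmap_and_madvise")
+    // picks the EmbFile implementation used for the on-disk record files.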
TF_CHECK_OK(ReadBoolFromEnvVar("TF_SSDHASH_ASYNC_COMPACTION", true, + &is_async_compaction_)); + + std::string io_scheme = "mmap_and_madvise"; + TF_CHECK_OK(ReadStringFromEnvVar("TF_SSDHASH_IO_SCHEME", "mmap_and_madvise", + &io_scheme)); + emb_file_creator_ = EmbFileCreatorFactory::Create(io_scheme); + EmbFile* ef = + emb_file_creator_->Create(path_, current_version_, BUFFER_SIZE); + emb_files_.emplace_back(ef); + + if (!is_async_compaction_) { + LOG(INFO) + << "Use Sync Compactor in SSDHashKV of Multi-tier Embedding Storage!"; + compaction_fn_ = [this]() { Compaction(); }; + check_buffer_fn_ = [this]() { CheckBuffer(); }; + save_kv_fn_ = [this](K key, const void* value_ptr, + bool is_compaction = false) { + SaveKV(key, value_ptr, is_compaction); + }; + } else { + LOG(INFO) << "Use Async Compactor in SSDHashKV of Multi-tier Embedding " + "Storage!"; + compaction_fn_ = []() {}; + check_buffer_fn_ = [this]() { CheckBufferAsync(); }; + save_kv_fn_ = [this](K key, const void* value_ptr, + bool is_compaction = false) { + SaveKVAsync(key, value_ptr, is_compaction); + }; + compaction_thread_ = Env::Default()->StartThread( + ThreadOptions(), "COMPACTION", [this]() { CompactionThread(); }); + } + } + + void Init() { + val_len_ = feat_desc_->data_bytes(); + max_app_count_ = BUFFER_SIZE / val_len_; + write_buffer_ = new char[BUFFER_SIZE]; + unsigned int max_key_count = 1 + int(BUFFER_SIZE / val_len_); + key_buffer_ = new K[max_key_count]; + done_ = true; + } + + void SetSsdRecordDescriptor(SsdRecordDescriptor* ssd_rec_desc) { + mutex_lock l(compact_save_mu_); + SSDIterator ssd_iter(&hash_map_, emb_files_, val_len_, write_buffer_); + for (ssd_iter.SeekToFirst(); ssd_iter.Valid(); ssd_iter.Next()) { + ssd_rec_desc->key_list.emplace_back(ssd_iter.Key()); + ssd_rec_desc->key_file_id_list.emplace_back(ssd_iter.FileId()); + ssd_rec_desc->key_offset_list.emplace_back(ssd_iter.Offset()); + } + ssd_rec_desc->file_prefix = path_; + + for (auto file : emb_files_) { + if (file->IsDeleted()) continue; + ssd_rec_desc->file_list.emplace_back(file->Version()); + ssd_rec_desc->invalid_record_count_list.emplace_back( + file->InvalidCount()); + ssd_rec_desc->record_count_list.emplace_back(file->Count()); + } + + if (buffer_cur_ > 0) { + if (!is_async_compaction_) { + emb_files_[current_version_]->Write(write_buffer_, + buffer_cur_ * val_len_); + emb_files_[current_version_]->Flush(); + ++current_version_; + CreateFile(current_version_); + } else { + emb_files_[evict_version_]->Write(write_buffer_, + buffer_cur_ * val_len_); + emb_files_[evict_version_]->Flush(); + evict_version_ = ++current_version_; + CreateFile(evict_version_); + } + TF_CHECK_OK(UpdateFlushStatus()); + current_offset_ = 0; + buffer_cur_ = 0; + } + } + + ~SSDHashKV() override { + if (buffer_cur_ > 0) { + if (!is_async_compaction_) { + emb_files_[current_version_]->Write(write_buffer_, + buffer_cur_ * val_len_); + } else { + emb_files_[evict_version_]->Write(write_buffer_, + buffer_cur_ * val_len_); + mutex_lock l(shutdown_mu_); + shutdown_ = true; + // Need last compaction or not??? 
+ // CompactionAsync(); + delete compaction_thread_; + } + buffer_cur_ = 0; + } + for (auto it : emb_files_) { + if (!it->IsDeleted()) { + it->DeleteFile(); + } + delete it; + } + DeallocateEmbPositions(); + delete[] write_buffer_; + delete[] key_buffer_; + } + + Status UpdateFlushStatus() { + for (int i = 0; i < buffer_cur_; ++i) { + auto iter = hash_map_.find_wait_free(key_buffer_[i]); + if (iter.first == EMPTY_KEY) { + return errors::NotFound("Unable to find Key: ", key_buffer_[i], + " in SSDHashKV."); + } else { + iter.second->flushed_ = true; + } + } + return OkStatus(); + } + + Status Lookup(K key, void** value_ptr) override { + auto iter = hash_map_.find_wait_free(key); + if (iter.first == EMPTY_KEY) { + return errors::NotFound("Unable to find Key: ", key, " in SSDHashKV."); + } else { + void* val = feat_desc_->Allocate(); + EmbPosition* posi = iter.second; + if (posi->flushed_) { + emb_files_[posi->version_]->Read((char*)val, val_len_, posi->offset_); + } else { + memcpy((char*)val, write_buffer_ + posi->buffer_offset_, val_len_); + } + *value_ptr = val; + posi->invalid_ = true; + return OkStatus(); + } + } + + Status Contains(K key) override { + auto iter = hash_map_.find_wait_free(key); + if (iter.first == EMPTY_KEY) { + return errors::NotFound("Unable to find Key: ", key, " in SSDHashKV."); + } else { + return OkStatus(); + } + } + + Status Insert(K key, const void* value_ptr) override { return OkStatus(); } + + Status BatchInsert(const std::vector& keys, + const std::vector& value_ptrs) override { + return BatchCommit(keys, value_ptrs); + } + + Status BatchCommit(const std::vector& keys, + const std::vector& value_ptrs) override { + compaction_fn_(); + __sync_fetch_and_add(&total_app_count_, keys.size()); + for (int i = 0; i < keys.size(); i++) { + check_buffer_fn_(); + save_kv_fn_(keys[i], value_ptrs[i], false); + delete value_ptrs[i]; + } + return OkStatus(); + } + + Status Commit(K key, const void* value_ptr) override { + compaction_fn_(); + __sync_fetch_and_add(&total_app_count_, 1); + check_buffer_fn_(); + save_kv_fn_(key, value_ptr, false); + return OkStatus(); + } + + Status Remove(K key) override { + if (hash_map_.erase_lockless(key)) { + return OkStatus(); + } else { + return errors::NotFound("Unable to find Key: ", key, " in SSDHashKV."); + } + } + + Status GetSnapshot(std::vector* key_list, + std::vector* value_ptr_list) override { + return OkStatus(); + } + + Status GetShardedSnapshot(std::vector>& key_list, + std::vector>& value_ptr_list, + int partition_id, int partition_nums) override { + return OkStatus(); + } + + Status GetSnapshot(std::vector* key_list, + std::vector* file_list) { + int64 bucket_count; + auto it = hash_map_.GetSnapshot(); + auto hash_map_dump = it.first; + bucket_count = it.second; + for (int64 j = 0; j < bucket_count; j++) { + if (hash_map_dump[j].first != LocklessHashMap::EMPTY_KEY_ && + hash_map_dump[j].first != LocklessHashMap::DELETED_KEY_) { + key_list->emplace_back(hash_map_dump[j].first); + file_list->emplace_back(hash_map_dump[j].second); + } + } + // Free the memory of snapshot allocated by hash map. 
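+    // The bucket dump returned by GetSnapshot() is heap memory owned by the
+    // caller; it is released with free() once the keys and file positions
+    // have been copied out.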
+ free(hash_map_dump); + return OkStatus(); + } + + void Import(K* key_list, int64* key_file_id_list, int64* key_offset_list, + int64 num_of_keys, std::map& file_id_map) { + for (int i = 0; i < num_of_keys; i++) { + int64 old_file_id = key_file_id_list[i]; + int64 new_file_id = file_id_map[old_file_id]; + EmbPosition* ep = + new EmbPosition(key_offset_list[i], new_file_id, 0, true); + hash_map_.insert_lockless(std::move(std::pair( + key_list[i], const_cast(ep)))); + } + } + + void CopyEmbFilesFromCkpt(int64* file_list, int64* invalid_record_count_list, + int64* record_count_list, int64 num_of_files, + const std::string& old_file_prefix) { + // delete the file created by constructor + emb_files_[0]->DeleteFile(); + delete emb_files_[0]; + emb_files_.erase(emb_files_.begin()); + for (int64 i = 0; i < num_of_files; i++) { + std::stringstream ss; + ss << old_file_prefix << "/" << file_list[i] << ".emb"; + std::string old_file_path = ss.str(); + EmbFile* f = + emb_file_creator_->Create(path_, current_version_, BUFFER_SIZE); + ++current_version_; + f->LoadExistFile(old_file_path, record_count_list[i], + invalid_record_count_list[i]); + emb_files_.emplace_back(f); + total_app_count_ += record_count_list[i]; + } + CreateFile(current_version_); + } + + int64 Size() const override { return hash_map_.size_lockless(); } + + void FreeValuePtr(void* value_ptr) override { + feat_desc_->Deallocate(value_ptr); + } + + private: + void WriteFile(size_t version, size_t curr_buffer_offset) { + emb_files_[version]->Write(write_buffer_, curr_buffer_offset); + emb_files_[version]->Flush(); + } + + void CreateFile(size_t version) { + emb_files_.emplace_back( + emb_file_creator_->Create(path_, version, BUFFER_SIZE)); + } + + Status FlushAndUpdate(char* value_buffer, K* id_buffer, + EmbPosition** pos_buffer, int64& n_ids, + std::vector& invalid_files) { + { + mutex_lock l(mu_); + compaction_version_ = ++current_version_; + CreateFile(compaction_version_); + } + + emb_files_[compaction_version_]->Write(value_buffer, n_ids * val_len_); + emb_files_[compaction_version_]->AddCount(n_ids); + emb_files_[compaction_version_]->Flush(); + + for (int64 i = 0; i < n_ids; i++) { + auto iter = hash_map_.insert_lockless( + std::move(std::pair(id_buffer[i], nullptr))); + if ((*(iter.first)).first == EMPTY_KEY) { + return errors::NotFound("Unable to find Key: ", id_buffer[i], + " in SSDHashKV."); + } else { + size_t offset = i * val_len_; + EmbPosition* ep = + new EmbPosition(offset, compaction_version_, offset, true); + bool flag = __sync_bool_compare_and_swap(&((*(iter.first)).second), + pos_buffer[i], ep); + if (!flag) { + emb_files_[compaction_version_]->AddInvalidCountAtomic(1); + if (emb_files_[compaction_version_]->IsNeedToBeCompacted()) { + evict_file_set_.insert_lockless(compaction_version_); + } + delete ep; + } else { + pos_out_of_date_compact_.emplace_back(pos_buffer[i]); + } + } + } + + for (int i = 0; i < invalid_files.size(); i++) { + evict_file_set_.erase_lockless(invalid_files[i]); + } + invalid_files.clear(); + n_ids = 0; + return OkStatus(); + } + + void CheckBuffer() { + size_t curr_buffer_offset = buffer_cur_ * val_len_; + if (curr_buffer_offset + val_len_ > BUFFER_SIZE) { + WriteFile(current_version_, curr_buffer_offset); + if (emb_files_[current_version_]->Count() >= max_app_count_) { + ++current_version_; + current_offset_ = 0; + CreateFile(current_version_); + } + TF_CHECK_OK(UpdateFlushStatus()); + buffer_cur_ = 0; + } + } + + void CheckBufferAsync() { + size_t curr_buffer_offset = buffer_cur_ * val_len_; 
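+    // Flush condition handled below: once the next record would overflow the
+    // fixed write buffer (BUFFER_SIZE bytes), the buffered records are
+    // written to the current eviction file, flush status is updated, and a
+    // freshly created file becomes the eviction target. For example, assuming
+    // val_len_ = 512 bytes and BUFFER_SIZE = 1 << 27 (128 MB), roughly
+    // 262,144 records are buffered between flushes.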
+ if (curr_buffer_offset + val_len_ > BUFFER_SIZE) { + WriteFile(evict_version_, curr_buffer_offset); + TF_CHECK_OK(UpdateFlushStatus()); + mutex_lock l(mu_); + evict_version_ = ++current_version_; + current_offset_ = 0; + CreateFile(evict_version_); + buffer_cur_ = 0; + } + } + + void AppendToWriteBuffer(size_t curr_buffer_offset, K key, + const void* value_ptr) { + current_offset_ += val_len_; + memcpy(write_buffer_ + curr_buffer_offset, (char*)value_ptr, val_len_); + key_buffer_[buffer_cur_] = key; + ++buffer_cur_; + } + + void AppendToPositionRecordQueue(EmbPosition* old_posi) { + // A parameter that can be adjusted in the future + if (pos_out_of_date_.size() > CAP_INVALID_POS) { + EmbPosition* posi = pos_out_of_date_.front(); + delete posi; + pos_out_of_date_.pop_front(); + } + pos_out_of_date_.emplace_back(old_posi); + } + + bool UpdatePosition(EmbPosition** pos, EmbPosition* old_posi, + EmbPosition* new_posi) { + bool flag = __sync_bool_compare_and_swap(pos, old_posi, new_posi); + if (flag) { + AppendToPositionRecordQueue(old_posi); + } + return flag; + } + + void SaveKV(K key, const void* value_ptr, bool is_compaction = false) { + size_t curr_buffer_offset = buffer_cur_ * val_len_; + EmbPosition* ep = new EmbPosition(current_offset_, current_version_, + curr_buffer_offset, false); + AppendToWriteBuffer(curr_buffer_offset, key, value_ptr); + + auto iter = hash_map_.insert_lockless(std::move( + std::pair(key, const_cast(ep)))); + emb_files_[ep->version_]->AddCount(1); + + if ((*(iter.first)).second != ep) { + EmbPosition* old_posi = (*(iter.first)).second; + int64 version = old_posi->version_; + if (!is_compaction) { + emb_files_[version]->AddInvalidCount(1); + // A parameter that can be adjusted in the future + if (version != current_version_ && + emb_files_[version]->IsNeedToBeCompacted()) { + evict_file_set_.insert_lockless(version); + } + } + UpdatePosition(&((*(iter.first)).second), old_posi, ep); + } + } + + void SaveKVAsync(K key, const void* value_ptr, bool is_compaction = false) { + size_t curr_buffer_offset = buffer_cur_ * val_len_; + EmbPosition* ep = new EmbPosition(current_offset_, evict_version_, + curr_buffer_offset, false); + + AppendToWriteBuffer(curr_buffer_offset, key, value_ptr); + auto iter = hash_map_.insert_lockless(std::move( + std::pair(key, const_cast(ep)))); + emb_files_[ep->version_]->AddCount(1); + + if ((*(iter.first)).second != ep) { + bool flag = false; + EmbPosition* old_posi = nullptr; + do { + old_posi = (*(iter.first)).second; + flag = UpdatePosition(&((*(iter.first)).second), old_posi, ep); + } while (!flag); + + if (!is_compaction) { + int version = old_posi->version_; + emb_files_[version]->AddInvalidCountAtomic(1); + // A parameter that can be adjusted in the future + if (version != evict_version_ && + emb_files_[version]->IsNeedToBeCompacted()) { + evict_file_set_.insert_lockless(version); + } + } + } + } + + void DeleteInvalidFiles() { + for (auto it : evict_file_map_) { + emb_files_[it.first]->DeleteFile(); + } + evict_file_map_.clear(); + } + + void DeleteInvalidRecord() { + for (auto it : pos_out_of_date_compact_) { + delete it; + } + pos_out_of_date_compact_.clear(); + } + + void LookupValidItems() { + for (auto it : hash_map_) { + EmbPosition* posi = it.second; + auto iter = evict_file_map_.find(posi->version_); + if (iter != evict_file_map_.end()) { + (*iter).second.emplace_back(it); + } + } + } + + void InitializeEvictMap() { + for (auto it : evict_file_set_) { + std::vector> tmp; + evict_file_map_[it] = tmp; + 
evict_file_set_.erase_lockless(it); + } + LookupValidItems(); + } + + void InitializeEvictMapWithoutErase() { + for (auto it : evict_file_set_) { + std::vector> tmp; + evict_file_map_[it] = tmp; + } + LookupValidItems(); + } + + void MoveToNewFile() { + void* val = feat_desc_->Allocate(); + for (auto it : evict_file_map_) { + EmbFile* file = emb_files_[it.first]; + total_app_count_ -= file->InvalidCount(); + file->MapForRead(); + for (auto it_vec : it.second) { + EmbPosition* posi = it_vec.second; + file->ReadWithMemcpy((char*)val, val_len_, posi->offset_); + CheckBuffer(); + SaveKV(it_vec.first, val, true); + } + file->UnmapForRead(); + } + feat_desc_->Deallocate(val); + } + + void MoveToNewFileAsync() { + char* compact_buffer = new char[BUFFER_SIZE]; + int64 n_ids = 0; + std::vector invalid_files; + unsigned int max_key_count = 1 + int(BUFFER_SIZE / val_len_); + K* id_buffer = new K[max_key_count]; + EmbPosition** pos_buffer = new EmbPosition*[max_key_count]; + for (auto it : evict_file_map_) { + EmbFile* file = emb_files_[it.first]; + __sync_fetch_and_sub(&total_app_count_, file->InvalidCount()); + file->MapForRead(); + for (auto it_vec : it.second) { + EmbPosition* posi = it_vec.second; + id_buffer[n_ids] = it_vec.first; + pos_buffer[n_ids] = posi; + file->ReadWithMemcpy(compact_buffer + val_len_ * n_ids, val_len_, + posi->offset_); + n_ids++; + if (n_ids == max_app_count_) { + Status st = FlushAndUpdate(compact_buffer, id_buffer, pos_buffer, + n_ids, invalid_files); + if (!st.ok()) { + LOG(WARNING) << "FLUSH ERROR: " << st.ToString(); + } + } + } + file->UnmapForRead(); + invalid_files.emplace_back(it.first); + } + Status st = FlushAndUpdate(compact_buffer, id_buffer, pos_buffer, n_ids, + invalid_files); + if (!st.ok()) { + LOG(WARNING) << "FLUSH ERROR: " << st.ToString(); + } + delete[] id_buffer; + delete[] compact_buffer; + delete[] pos_buffer; + } + + void Compaction() { + int64 hash_size = hash_map_.size_lockless(); + // These parameter that can be adjusted in the future + if (hash_size * 3 / 2 < total_app_count_ || + total_app_count_ - hash_size > CAP_INVALID_ID) { + // delete the evict_files + DeleteInvalidFiles(); + // Initialize evict_file_map + InitializeEvictMap(); + // read embeddings and write to new file + MoveToNewFile(); + } + } + + void CompactionAsync() { + int64 hash_size = hash_map_.size_lockless(); + // These parameter that can be adjusted in the future + if (hash_size * 3 / 2 < total_app_count_ || + total_app_count_ - hash_size > CAP_INVALID_ID) { + DeleteInvalidRecord(); + // delete the evict_files + DeleteInvalidFiles(); + // Initialize evict_file_map + InitializeEvictMapWithoutErase(); + // read embeddings and write to new file + MoveToNewFileAsync(); + } + } + + void CompactionThread() { + if (val_len_ == -1) { + while (!done_) { + } + } + while (!shutdown_) { + if (shutdown_mu_.try_lock()) { + if (!shutdown_) { + mutex_lock l(compact_save_mu_); + CompactionAsync(); + } + shutdown_mu_.unlock(); + } + Env::Default()->SleepForMicroseconds(1000); + } + } + + std::string DebugString() const { + return strings::StrCat( + "map info size:", Size(), + ", map info bucket_count:", hash_map_.load_factor(), + ",map info load_factor:", hash_map_.load_factor(), + ", map info max_load_factor:", hash_map_.max_load_factor(), + ", map info min_load_factor: ", hash_map_.min_load_factor(), + ", evict_version: ", evict_version_, + ", compaction_version: ", compaction_version_); + } + + private: + void DeallocateEmbPositions() { + std::pair* hash_map_dump; + int64 bucket_count; + 
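+    // Walk a final snapshot of the lockless map and delete every EmbPosition
+    // still referenced by a live bucket, skipping the EMPTY_KEY/DELETED_KEY
+    // sentinels; the snapshot array itself is freed afterwards.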
auto it = hash_map_.GetSnapshot(); + hash_map_dump = it.first; + bucket_count = it.second; + for (int64 j = 0; j < bucket_count; j++) { + if (hash_map_dump[j].first != SSDHashKV::EMPTY_KEY && + hash_map_dump[j].first != SSDHashKV::DELETED_KEY) { + delete hash_map_dump[j].second; + } + } + free(hash_map_dump); + } + + private: + size_t val_len_ = -1; + volatile size_t current_version_ = 0; + volatile size_t evict_version_ = 0; + volatile size_t compaction_version_ = 0; + volatile size_t current_offset_ = 0; + volatile size_t buffer_cur_ = 0; + size_t total_app_count_ = 0; + size_t max_app_count_; + + char* write_buffer_ = nullptr; + K* key_buffer_ = nullptr; + bool is_async_compaction_; + FeatureDescriptor* feat_desc_; + + int total_dims_; + std::string path_; + + typedef google::dense_hash_map_lockless LockLessHashMap; + LockLessHashMap hash_map_; + mutex mu_; + mutex shutdown_mu_; + mutex compact_save_mu_; + + static const int EMPTY_KEY; + static const int DELETED_KEY; + static const int CAP_INVALID_POS; + static const int CAP_INVALID_ID; + static const size_t BUFFER_SIZE; + + std::vector emb_files_; + std::deque pos_out_of_date_; + std::deque pos_out_of_date_compact_; + typedef google::dense_hash_set_lockless LocklessHashSet; + LocklessHashSet evict_file_set_; + std::map>> evict_file_map_; + + Thread* compaction_thread_ = nullptr; + volatile bool shutdown_ = false; + volatile bool done_ = false; + // std::atomic_flag flag_ = ATOMIC_FLAG_INIT; unused + + std::function compaction_fn_; + std::function check_buffer_fn_; + std::function save_kv_fn_; + EmbFileCreator* emb_file_creator_ = nullptr; +}; +template +const int SSDHashKV::EMPTY_KEY = -1; +template +const int SSDHashKV::DELETED_KEY = -2; +template +const int SSDHashKV::CAP_INVALID_POS = 200000; +template +const int SSDHashKV::CAP_INVALID_ID = 10000000; +template +const size_t SSDHashKV::BUFFER_SIZE = 1 << 27; + +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SSD_HASH_KV_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/ssd_record_descriptor.cc b/deepray/custom_ops/embedding_variable/cc/embedding/ssd_record_descriptor.cc new file mode 100644 index 00000000..60879c19 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/ssd_record_descriptor.cc @@ -0,0 +1,80 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=======================================================================*/ +#include "ssd_record_descriptor.h" + +#include "deepray/custom_ops/embedding_variable/cc/kernels/save_restore_tensor_ev.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/io/path.h" + +namespace tensorflow { +namespace embedding { +template +template +void SsdRecordDescriptor::DumpSection(const std::vector& data_vec, + const std::string& section_str, + BundleWriter* writer, + std::vector& dump_buffer) { + EVVectorDataDumpIterator iter(data_vec); + SaveTensorWithFixedBuffer(section_str, writer, dump_buffer.data(), + dump_buffer.size(), &iter, + TensorShape({data_vec.size()})); +} +#define REGISTER_KERNELS(ktype, ttype) \ + template void SsdRecordDescriptor::DumpSection( \ + const std::vector&, const std::string&, BundleWriter*, \ + std::vector&); +REGISTER_KERNELS(int32, int32); +REGISTER_KERNELS(int32, int64); +REGISTER_KERNELS(int64, int32); +REGISTER_KERNELS(int64, int64); +#undef REGISTER_KERNELS + +template +void SsdRecordDescriptor::DumpSsdMeta(const std::string& prefix, + const std::string& var_name) { + std::fstream fs; + std::string var_name_temp(var_name); + std::string new_str = "_"; + int64 pos = var_name_temp.find("/"); + while (pos != std::string::npos) { + var_name_temp.replace(pos, 1, new_str.data(), 1); + pos = var_name_temp.find("/"); + } + + std::string ssd_record_path = prefix + "-" + var_name_temp + "-ssd_record"; + BundleWriter ssd_record_writer(Env::Default(), ssd_record_path); + size_t bytes_limit = 8 << 20; + std::vector dump_buffer(bytes_limit); + + DumpSection(key_list, "keys", &ssd_record_writer, dump_buffer); + DumpSection(key_file_id_list, "keys_file_id", &ssd_record_writer, + dump_buffer); + DumpSection(key_offset_list, "keys_offset", &ssd_record_writer, dump_buffer); + DumpSection(file_list, "files", &ssd_record_writer, dump_buffer); + DumpSection(invalid_record_count_list, "invalid_record_count", + &ssd_record_writer, dump_buffer); + DumpSection(record_count_list, "record_count", &ssd_record_writer, + dump_buffer); + + ssd_record_writer.Finish(); +} +#define REGISTER_KERNELS(ktype) \ + template void SsdRecordDescriptor::DumpSsdMeta(const std::string&, \ + const std::string&); +REGISTER_KERNELS(int32); +REGISTER_KERNELS(int64); +#undef REGISTER_KERNELS +} // namespace embedding +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/ssd_record_descriptor.h b/deepray/custom_ops/embedding_variable/cc/embedding/ssd_record_descriptor.h new file mode 100644 index 00000000..d5a46bc6 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/ssd_record_descriptor.h @@ -0,0 +1,105 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=======================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SSD_RECORD_DESCRIPTOR_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SSD_RECORD_DESCRIPTOR_H_ + +#include +#include +#include +#include +#include + +#include "embedding_var_dump_iterator.h" +#include "kv_interface.h" +#include "tensorflow/core/platform/env.h" + +namespace tensorflow { +class BundleWriter; +namespace embedding { + +template +class SsdRecordDescriptor { + public: + // prefix of embedding file + tstring file_prefix; + // keys in ssd storage + std::vector key_list; + // file ids of features + std::vector key_file_id_list; + // offsets in the file of features + std::vector key_offset_list; + // files in ssd storage + std::vector file_list; + // number of invalid records in the file + std::vector invalid_record_count_list; + // number of records in the file + std::vector record_count_list; + + void GenerateCheckpoint(const std::string& prefix, + const std::string& var_name) { + DumpSsdMeta(prefix, var_name); + CopyEmbeddingFilesToCkptDir(prefix, var_name); + } + + private: + template + void DumpSection(const std::vector& data_vec, + const std::string& section_str, BundleWriter* writer, + std::vector& dump_buffer); + + void DumpSsdMeta(const std::string& prefix, const std::string& var_name); + + void CopyEmbeddingFilesToCkptDir(const std::string& prefix, + const std::string& var_name) { + std::string var_name_temp(var_name); + std::string new_str = "_"; + int64 pos = var_name_temp.find("/"); + while (pos != std::string::npos) { + var_name_temp.replace(pos, 1, new_str.data(), 1); + pos = var_name_temp.find("/"); + } + + std::string embedding_folder_path = + prefix + "-" + var_name_temp + "-emb_files/"; + Status s = Env::Default()->CreateDir(embedding_folder_path); + if (errors::IsAlreadyExists(s)) { + int64 undeleted_files, undeleted_dirs; + Env::Default()->DeleteRecursively(embedding_folder_path, &undeleted_files, + &undeleted_dirs); + Env::Default()->CreateDir(embedding_folder_path); + } + + for (int64 i = 0; i < file_list.size(); i++) { + int64 file_id = file_list[i]; + std::stringstream old_ss; + old_ss << std::setw(4) << std::setfill('0') << file_id << ".emb"; + std::string file_path = file_prefix + old_ss.str(); + std::string file_name = file_path.substr(file_path.rfind("/")); + std::stringstream new_ss; + new_ss << file_id << ".emb"; + std::string new_file_path = embedding_folder_path + new_ss.str(); + Status s = Env::Default()->CopyFile(file_path, new_file_path); + if (!s.ok()) { + LOG(FATAL) << "Copy file " << file_path << " failed!"; + } + } + } +}; + +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_SSD_RECORD_DESCRIPTOR_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/storage.h b/deepray/custom_ops/embedding_variable/cc/embedding/storage.h new file mode 100644 index 00000000..40817e59 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/storage.h @@ -0,0 +1,367 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_H_ + +#include "cache.h" +#include "deepray/custom_ops/embedding_variable/config.pb.h" +#include "embedding_memory_pool.h" +#include "embedding_var_ckpt_data.h" +#include "embedding_var_restore.h" +#include "filter_policy.h" +#include "kv_interface.h" +#include "shrink_policy.h" +#include "storage_config.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/util/work_sharder.h" +#if GOOGLE_CUDA +#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" +#include "tensorflow/core/platform/stream_executor.h" +#endif + +namespace tensorflow { +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; + +template +class CheckpointLoader; + +template +class EmbeddingVar; + +template +struct SsdRecordDescriptor; + +template +class GPUHashTable; + +class BundleWriter; +class BundleReader; + +template +struct EmbeddingVarContext; +namespace embedding { + +template +class Storage { + friend class CheckpointLoader; + + public: + explicit Storage(const StorageConfig& storage_config) + : storage_config_(storage_config) { + initialize_value_.resize(storage_config.embedding_config.slot_num + 1); + } + virtual ~Storage() {} + TF_DISALLOW_COPY_AND_ASSIGN(Storage); + + virtual Status Get(K key, void** value_ptr) = 0; +#if GOOGLE_CUDA + virtual void BatchGet(const EmbeddingVarContext& ctx, const K* key, + void** value_ptr_list, int64 num_of_keys) {} + + virtual void BatchGetOrCreate( + const EmbeddingVarContext& ctx, const K* key, + void** value_ptr_list, int64 num_of_keys, int64 value_len, + std::vector>& not_found_cursor_list) {} +#endif // GOOGLE_CUDA + virtual Status Contains(K key) = 0; + virtual void CreateAndInsert(K key, void** value_ptr, + bool to_dram = false) = 0; + virtual void Insert(K key, void** value_ptr) = 0; + virtual void Init() {} + virtual void SetValueLen(int64 value_len) {} + virtual Status GetOrCreate(K key, void** value_ptr) = 0; + virtual int LookupTier(K key) const = 0; + virtual Status Remove(K key) = 0; + virtual int64 Size() const = 0; + virtual int64 Size(int level) const = 0; + virtual Status GetSnapshot(std::vector* key_list, + std::vector* value_ptr_list) = 0; + virtual Status GetShardedSnapshot( + std::vector>& key_list, + std::vector>& value_ptr_list, int partition_id, + int partition_nums) = 0; + virtual Status Save(const string& tensor_name, const string& prefix, + BundleWriter* writer, const EmbeddingConfig& emb_config, + ShrinkArgs& shrink_args, int64 value_len, + V* default_value) = 0; + + virtual Status BatchCommit(const std::vector& keys, + const std::vector& value_ptrs) = 0; + + virtual Status Eviction(K* evict_ids, int64 evict_size) = 0; + + virtual void CopyEmbeddingsFromCPUToGPU( + int total, const K* keys, const std::list& copyback_cursor, + V** memcpy_address, size_t value_len, void** gpu_value_ptrs, + V* memcpy_buffer_gpu, se::Stream* compute_stream, EventMgr* event_mgr, + const DeviceBase::CpuWorkerThreads* worker_threads) = 0; + + virtual void BatchLookupOrCreate(const K* key, V* val, V* default_v, + int32 default_v_num, size_t n, + const Eigen::GpuDevice& device) {} + virtual void BatchLookupOrCreateKeys(const K* key, int32* item_idxs, size_t n, + const Eigen::GpuDevice& device) 
{} + virtual void BatchLookup(const Eigen::GpuDevice& device, const K* keys, + V* val, size_t n, const V* default_v) {} + virtual GPUHashTable* HashTable() { return nullptr; } + + virtual void InitCache(embedding::CacheStrategy cache_strategy) = 0; + virtual int64 CacheSize() const = 0; + virtual BatchCache* Cache() = 0; + virtual bool IsMultiLevel() = 0; + virtual bool IsUseHbm() = 0; + virtual bool IsSingleHbm() = 0; + virtual bool IsUsePersistentStorage() { return false; }; + virtual void Schedule(std::function fn) = 0; + virtual void CreateEmbeddingMemoryPool(Allocator* alloc, int64 value_len, + int64 block_size) = 0; + + inline mutex* get_mutex() { return &mu_; } + inline int64 GetAllocLen() { return alloc_len_; } + inline int64 GetOffset(int64 index) { return alloc_len_ * index; } + inline int64 GetTotalDims() { return total_dims_; } + inline embedding::StorageType GetStorageType() { + return storage_config_.type; + } + inline std::string GetStoragePath() { return storage_config_.path; } + inline embedding::CacheStrategy CacheStrategy() { + return storage_config_.cache_strategy; + } + + inline std::string DebugString() const { + return strings::StrCat("class type: ", typeid(this).name(), + " alloc len: ", alloc_len_, + " total dims: ", total_dims_, + " storage config: ", storage_config_.DebugString()); + } + + inline void Insert(const std::vector& keys, void** value_ptrs) { + for (size_t i = 0; i < keys.size(); i++) { + Insert(keys[i], value_ptrs[i]); + } + } + + virtual void UpdateCache(const Tensor& indices, + const Tensor& indices_counts) {} + + virtual void UpdateCache(const Tensor& indices) {} + + virtual void AddToCachePrefetchList(const Tensor& indices) {} + + virtual void AddToCache(const Tensor& indices) {} + + virtual void Restore(const std::string& name_string, + const std::string& file_name_string, int64 partition_id, + int64 partition_num, int64 value_len, bool is_incr, + bool reset_version, const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, BundleReader* reader, + EmbeddingVar* ev, + FilterPolicy>* filter) { + CheckpointLoader restorer(reinterpret_cast*>(this), ev, + filter, name_string, file_name_string, + partition_id, partition_num, is_incr, + reset_version, reader); + restorer.RestoreCkpt(emb_config, device); + }; + + virtual void UpdateValuePtr(K key, void* new_value_ptr, + void* old_value_ptr) = 0; + + virtual void Import(K key, V* value, int64 freq, int64 version, + int emb_index) = 0; + + virtual Status RestoreFeatures(int64 key_num, int bucket_num, + int64 partition_id, int64 partition_num, + int64 value_len, bool is_filter, bool is_incr, + const EmbeddingConfig& emb_config, + const Eigen::GpuDevice* device, + FilterPolicy>* filter, + RestoreBuffer& restore_buff) { + return OkStatus(); + } + + protected: + virtual Status RestoreSSD(int64 emb_index, int64 emb_slot_num, + int64 value_len, + const std::string& ssd_emb_file_name, + EmbeddingVar* ev, + RestoreSSDBuffer& restore_buff) { + for (int64 i = 0; i < restore_buff.num_of_keys; i++) { + int64 file_id = restore_buff.key_file_id_list_buf[i]; + int64 key_offset = restore_buff.key_offset_list_buf[i]; + // Read data from embedding files on SSD. Data are stored in + // NormalContiguousValuePtr temporarily. 
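+      // Each record is read from "<ssd_emb_file_name>/<file_id>.emb" at
+      // key_offset: the region is mmapped read-only, copied into a temporary
+      // DRAM feature-descriptor slot, handed to Import() together with its
+      // saved frequency and version, and the slot is then deallocated.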
+ std::stringstream ss; + ss << ssd_emb_file_name << "/" << file_id << ".emb"; + int fd = open(ss.str().data(), O_RDONLY); + EmbeddingConfig& emb_config = storage_config_.embedding_config; + FeatureDescriptor normal_feat_desc( + emb_config.block_num, emb_config.slot_num + 1, ev_allocator(), + StorageType::DRAM, true, true, {false, 0}); + void* value_ptr = normal_feat_desc.Allocate(); + char* file_addr = + (char*)mmap(nullptr, normal_feat_desc.data_bytes() + key_offset, + PROT_READ, MAP_PRIVATE, fd, 0); + memcpy(value_ptr, file_addr + key_offset, normal_feat_desc.data_bytes()); + munmap(file_addr, normal_feat_desc.data_bytes() + key_offset); + close(fd); + // Copy Data to ValuePtr, data of slots are set by primary here. + int64 import_freq = normal_feat_desc.GetFreq(value_ptr); + int64 import_version = normal_feat_desc.GetVersion(value_ptr); + V* value = normal_feat_desc.GetEmbedding(value_ptr, emb_index); + Import(restore_buff.key_list_buf[i], value, import_freq, import_version, + emb_index); + normal_feat_desc.Deallocate(value_ptr); + } + return OkStatus(); + } + + private: + void GeneratePartitionedCkptData( + const std::vector& key_list, const std::vector& value_ptr_list, + EmbeddingVarCkptData* partitioned_ckpt_data, + const EmbeddingConfig& emb_config, V* default_value, + FeatureDescriptor* feat_desc) { + std::vector> ev_ckpt_data_parts( + kSavedPartitionNum); + + bool save_unfiltered_features = true; + TF_CHECK_OK(ReadBoolFromEnvVar("TF_EV_SAVE_FILTERED_FEATURES", true, + &save_unfiltered_features)); + + bool is_save_freq = emb_config.is_save_freq(); + bool is_save_version = emb_config.is_save_version(); + + for (int64 i = 0; i < key_list.size(); i++) { + for (int part_id = 0; part_id < kSavedPartitionNum; part_id++) { + if (key_list[i] % kSavedPartitionNum == part_id) { + ev_ckpt_data_parts[part_id].Emplace( + key_list[i], value_ptr_list[i], emb_config, default_value, + feat_desc, is_save_freq, is_save_version, + save_unfiltered_features); + break; + } + } + } + + partitioned_ckpt_data->SetWithPartition(ev_ckpt_data_parts); + } + + void GeneratePartitionedCkptData( + const std::vector& key_list, const std::vector& value_ptr_list, + EmbeddingVarCkptData* partitioned_ckpt_data, + const EmbeddingConfig& emb_config, V* default_value, + const std::vector*>& feat_desc) { + std::vector> ev_ckpt_data_parts( + kSavedPartitionNum); + + bool save_unfiltered_features = true; + TF_CHECK_OK(ReadBoolFromEnvVar("TF_EV_SAVE_FILTERED_FEATURES", true, + &save_unfiltered_features)); + + bool is_save_freq = emb_config.is_save_freq(); + bool is_save_version = emb_config.is_save_version(); + + for (int64 i = 0; i < key_list.size(); i++) { + for (int part_id = 0; part_id < kSavedPartitionNum; part_id++) { + if (key_list[i] % kSavedPartitionNum == part_id) { + int feat_desc_type = (int64)value_ptr_list[i] >> kDramFlagOffset; + ev_ckpt_data_parts[part_id].Emplace( + key_list[i], value_ptr_list[i], emb_config, default_value, + feat_desc[feat_desc_type], is_save_freq, is_save_version, + save_unfiltered_features); + break; + } + } + } + + partitioned_ckpt_data->SetWithPartition(ev_ckpt_data_parts); + } + + void GeneratePartitionedCkptData( + const std::vector& key_list, const std::vector& value_ptr_list, + EmbeddingVarCkptData* partitioned_ckpt_data) { + std::vector> ev_ckpt_data_parts( + kSavedPartitionNum); + + for (int64 i = 0; i < key_list.size(); i++) { + for (int part_id = 0; part_id < kSavedPartitionNum; part_id++) { + if (key_list[i] % kSavedPartitionNum == part_id) { + 
ev_ckpt_data_parts[part_id].Emplace(key_list[i], value_ptr_list[i]); + break; + } + } + } + + partitioned_ckpt_data->SetWithPartition(ev_ckpt_data_parts); + } + + protected: + Status SaveToCheckpoint(const string& tensor_name, BundleWriter* writer, + const EmbeddingConfig& emb_config, int64 value_len, + V* default_value, const std::vector& key_list, + const std::vector& value_ptr_list, + FeatureDescriptor* feat_desc, + ValueIterator* value_iter = nullptr) { + EmbeddingVarCkptData partitioned_ckpt_data; + GeneratePartitionedCkptData(key_list, value_ptr_list, + &partitioned_ckpt_data, emb_config, + default_value, feat_desc); + Status s = partitioned_ckpt_data.ExportToCkpt(tensor_name, writer, + value_len, value_iter); + return OkStatus(); + } + + Status SaveToCheckpoint(const string& tensor_name, BundleWriter* writer, + const EmbeddingConfig& emb_config, int64 value_len, + V* default_value, const std::vector& key_list, + const std::vector& value_ptr_list, + const std::vector*>& feat_desc, + ValueIterator* value_iter = nullptr) { + EmbeddingVarCkptData partitioned_ckpt_data; + GeneratePartitionedCkptData(key_list, value_ptr_list, + &partitioned_ckpt_data, emb_config, + default_value, feat_desc); + Status s = partitioned_ckpt_data.ExportToCkpt(tensor_name, writer, + value_len, value_iter); + return OkStatus(); + } + + Status SaveToCheckpoint(const string& tensor_name, BundleWriter* writer, + int64 value_len, const std::vector& key_list, + const std::vector& value_ptr_list) { + EmbeddingVarCkptData partitioned_ckpt_data; + GeneratePartitionedCkptData(key_list, value_ptr_list, + &partitioned_ckpt_data); + Status s = + partitioned_ckpt_data.ExportToCkpt(tensor_name, writer, value_len); + return OkStatus(); + } + + protected: + int64 alloc_len_ = 0; + int64 total_dims_ = 0; + StorageConfig storage_config_; + + mutex mu_; + std::atomic_flag flag_ = ATOMIC_FLAG_INIT; + std::vector initialize_value_; +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/storage_config.h b/deepray/custom_ops/embedding_variable/cc/embedding/storage_config.h new file mode 100644 index 00000000..79e17ae1 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/storage_config.h @@ -0,0 +1,59 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_CONFIG_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_CONFIG_H_ + +#include "cache.h" +#include "embedding_config.h" + +namespace tensorflow { +namespace embedding { +struct StorageConfig { + StorageConfig() + : type(StorageType::DEFAULT), + path(""), + cache_strategy(CacheStrategy::LFU) { + size = {1 << 30, 1 << 30, 1 << 30, 1 << 30}; + } + + StorageConfig(StorageType t, const std::string& p, + const std::vector& s, const EmbeddingConfig& ec, + const CacheStrategy cache_strategy_ = CacheStrategy::LFU) + : type(t), + path(p), + size(s), + embedding_config(ec), + cache_strategy(cache_strategy_) {} + StorageType type; + std::string path; + std::vector size; + CacheStrategy cache_strategy; + EmbeddingConfig embedding_config; + + std::string DebugString() const { + std::string size_str = + std::accumulate(std::next(size.begin()), size.end(), + std::to_string(size[0]), [](std::string a, int64_t b) { + return std::move(a) + "_" + std::to_string(b); + }); + return strings::StrCat("storage type: ", type, " storage path: ", path, + " storage capacity: ", size_str, + " cache strategy: ", cache_strategy); + } +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_CONFIG_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/embedding/storage_factory.h b/deepray/custom_ops/embedding_variable/cc/embedding/storage_factory.h new file mode 100644 index 00000000..67d8a0b6 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/embedding/storage_factory.h @@ -0,0 +1,78 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_FACTORY_H_ +#define TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_FACTORY_H_ + +#include "deepray/custom_ops/embedding_variable/cc/lib/allocator.h" +#include "deepray/custom_ops/embedding_variable/config.pb.h" +#include "dram_leveldb_storage.h" +#include "dram_pmem_storage.h" +#include "dram_ssd_storage.h" +#include "hbm_dram_ssd_storage.h" +#include "hbm_dram_storage.h" +#include "multi_tier_storage.h" +#include "single_tier_storage.h" +#include "storage.h" +#include "storage_config.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace embedding { +class StorageFactory { + public: + template + static Storage* Create(const StorageConfig& sc, + Allocator* gpu_allocator, + FeatureDescriptor* feat_desc, + const string& name) { + switch (sc.type) { + case StorageType::DRAM: + return new DramStorage(sc, feat_desc); + case StorageType::PMEM_MEMKIND: + feat_desc->SetAllocator(pmem_allocator()); + return new PmemMemkindStorage(sc, feat_desc); + case StorageType::PMEM_LIBPMEM: + feat_desc->SetAllocator( + experimental_pmem_allocator(sc.path, sc.size[0])); + return new PmemLibpmemStorage(sc, feat_desc); + case StorageType::DRAM_PMEM: + return new DramPmemStorage(sc, feat_desc, name); + case StorageType::LEVELDB: + case StorageType::DRAM_LEVELDB: + return new DramLevelDBStore(sc, feat_desc, name); + case StorageType::SSDHASH: + case StorageType::DRAM_SSDHASH: + return new DramSsdHashStorage(sc, feat_desc, name); + case StorageType::HBM: +#if GOOGLE_CUDA + return new HbmStorage(sc, gpu_allocator, feat_desc); +#endif // GOOGLE_CUDA + case StorageType::HBM_DRAM: +#if GOOGLE_CUDA + return new HbmDramStorage(sc, gpu_allocator, feat_desc, name); +#endif // GOOGLE_CUDA + case StorageType::HBM_DRAM_SSDHASH: +#if GOOGLE_CUDA + return new HbmDramSsdStorage(sc, gpu_allocator, feat_desc, name); +#endif // GOOGLE_CUDA + default: + return new DramStorage(sc, feat_desc); + } + } +}; +} // namespace embedding +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_FACTORY_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_local_op.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_local_op.cc new file mode 100644 index 00000000..e50be39e --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_local_op.cc @@ -0,0 +1,757 @@ +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/resource_var.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +namespace { +// input: input tensor value (it sores the id) +// cols: How many elements to do SparseSegmentSum +// output: rows * embedding_size +template +static void sparse_gather_v1(T *input, int rows, int cols, + float *embedding_table, float *output, + int embedding_size, bool is_mean) { + T *pidx = input; + for (int i = 0; i < rows; ++i) { + for (int j = 0; j < embedding_size; ++j) { + float value = 0; + int dense_num = 0; + for (int k = 0; k < cols; ++k) { + int embedding_row = (int)pidx[k]; + if (embedding_row >= 0) { + value += embedding_table[embedding_row * embedding_size + j]; 
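+          // Only non-negative ids contribute; dense_num counts the valid ids in
+          // this row so the "mean" combiner can divide by that count below.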
+ dense_num += 1; + } + } + + if (is_mean && dense_num > 0) { + *output++ = value / dense_num; + } else { + *output++ = value; + } + } + pidx += cols; + } +} + +// embedding_size = 1 +template +static void sparse_gather_embeddingsize1(T *input, int rows, int cols, + float *embedding_table, float *output, + bool is_mean) { + T *pidx = input; + for (int i = 0; i < rows; ++i) { + float value = 0; + int dense_num = 0; + for (int k = 0; k < cols; ++k) { + int embedding_row = pidx[k]; + if (embedding_row >= 0) { + value += embedding_table[embedding_row]; + dense_num += 1; + } + } + if (is_mean && dense_num > 0) { + *output++ = value / dense_num; + } else { + *output++ = value; + } + pidx += cols; + } +} + +// input cols = 1 +template +static void sparse_gather_column1(T *input, int rows, float *embedding_table, + float *output, int embedding_size) { + T *pidx = input; + for (int i = 0; i < rows; ++i) { + int embedding_row = *pidx++; + if (embedding_row >= 0) { + float *pembedding = &embedding_table[embedding_row * embedding_size]; + for (int j = 0; j < embedding_size; ++j) { + output[j] = pembedding[j]; + } + } else { + for (int j = 0; j < embedding_size; ++j) { + output[j] = 0; + } + } + output += embedding_size; + } +} + +template +static void sparse_gather(T *input, int rows, int cols, float *embedding_table, + float *output, int embedding_size, bool is_mean) { + if (embedding_size == 1) { + sparse_gather_embeddingsize1(input, rows, cols, embedding_table, output, + is_mean); + } else if (cols == 1) { + sparse_gather_column1(input, rows, embedding_table, output, embedding_size); + } else { + // printf("General sparse gather!\n"); + sparse_gather_v1(input, rows, cols, embedding_table, output, embedding_size, + is_mean); + } +} + +// Use memcpy or manually assign? 
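+// A straight copy can use memcpy (mycopy below); the element-wise loops are kept
+// where accumulation (myadd) or in-place scaling (myscale) is required.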
+static void mycopy(float *dst, float *src, int float_num) { + memcpy(dst, src, float_num * sizeof(float)); +} + +static void myadd(float *dst, float *src, int float_num) { + for (int i = 0; i < float_num; ++i) { + dst[i] += src[i]; + } +} + +template +static void row_add(std::map> &mapSet, int64 row_nums) { + for (auto it = mapSet.begin(); it != mapSet.end(); ++it) { + T *dst = it->first; + std::vector srcs(std::move(it->second)); + int64 src_size = srcs.size(); + + for (int row = 0; row < row_nums; ++row) { + dst[row] = 0.0; + for (int index = 0; index < src_size; ++index) { + dst[row] += srcs[index][row]; + } + } + } +} + +template +static void row_add_mean(std::map> &mapSet, + int64 row_nums, bool is_mean) { +#define L(n) srcs[index + n][row] + + for (auto it = mapSet.begin(); it != mapSet.end(); ++it) { + T *dst = it->first; + std::vector srcs(std::move(it->second)); + int64 src_size = srcs.size(); + + if (src_size == 1) { + for (int row = 0; row < row_nums; ++row) { + dst[row] = srcs[0][row]; + } + continue; + } + + float sum_tmp = 0.0; + int64 index = 0; + int64 r = (src_size) % 8; + int64 m = 1; + if (src_size < 10 && is_mean) m = src_size; + + for (int row = 0; row < row_nums; ++row) { + sum_tmp = 0.0; + index = 0; + dst[row] = 0.0; + switch (r) { + case 2: { + sum_tmp = (L(0) + L(1)) / m; + dst[row] = sum_tmp; + break; + } + case 3: { + sum_tmp = (L(0) + L(1) + L(2)) / m; + dst[row] = sum_tmp; + break; + } + case 4: { + sum_tmp = (L(0) + L(1) + L(2) + L(3)) / m; + dst[row] = sum_tmp; + break; + } + case 5: { + sum_tmp = (L(0) + L(1) + L(2) + L(3) + L(4)) / m; + dst[row] = sum_tmp; + break; + } + case 6: { + sum_tmp = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5)) / m; + dst[row] = sum_tmp; + break; + } + case 7: { + sum_tmp = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6)) / m; + dst[row] = sum_tmp; + break; + } + case 0: { + dst[row] = + (L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6) + L(7)) / m; + index += 8; + break; + } + case 1: { + dst[row] = + (L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6) + L(7) + L(8)) / + m; + index += 8; + break; + } + } + for (index += r; index < src_size; index += 8) { + sum_tmp = L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6) + L(7); + dst[row] += sum_tmp; + } + if (src_size >= 10 && is_mean) dst[row] /= src_size; + } + } +} + +static void myscale(float *dst, float factor, int float_num) { + for (int i = 0; i < float_num; ++i) { + dst[i] *= factor; + } +} + +template +static void sparse_gather(Tid *input, int64 input_size, Tshape *indice, + int indice_dim, Tshape *shape, int rows, int cols, + float *embedding_table, float *output, + int embedding_size, bool is_mean) { + // Record how many values in each row + int *row_values = new int[rows]; + memset(row_values, 0, rows * sizeof(int)); + + std::map> mapSet; + + for (int64 i = 0; i < input_size; ++i) { + Tid id = input[i]; + if (i < input_size && input[i] < 0) { // Skip invalid id + continue; + } + auto row = indice[i * indice_dim]; + // for (int k = 1; k < indice_dim - 1; ++k) { + // row = row * shape[k] + indice[i * indice_dim + k]; + // } + row_values[row] += 1; + + auto index = row * embedding_size; + if (!mapSet.count(&output[index])) { + std::vector srcs; + mapSet[&output[index]] = srcs; + } + mapSet[&output[index]].push_back(&embedding_table[id * embedding_size]); + } + + // row_add(mapSet, embedding_size); + row_add_mean(mapSet, embedding_size, is_mean); + + for (int i = 0; i < rows; ++i) { + if (row_values[i] == 0) { + memset(&output[i * embedding_size], 0, embedding_size * sizeof(float)); + 
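+      // Batch rows that saw no valid id stay zero-filled. Mean scaling is already
+      // applied inside row_add_mean, so the per-row rescale below stays disabled.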
// } else if (is_mean && row_values[i] > 1) { + // float factor = 1.0f / row_values[i]; + // myscale(&output[i * embedding_size], factor, embedding_size); + } + } + delete[] row_values; +} +} // namespace + +/* + sample: [['green' 'red' 'blue' 'yellow' 'pink' 'blue' 'red' 'indigo'] + ['' '' '' '' '' '' '' ''] + ['' '' '' 'yellow' 'pink' 'blue' 'red' 'indigo'] + ['' '' '' '' '' '' '' ''] + ['green' '' '' '' '' '' '' '']] + => [[ True True True True True True True True] + [False False False False False False False False] + [False False False True True True True True] + [False False False False False False False False] + [ True False False False False False False False]] +-------------------------------------------------------------------------------------- + weight: float[[ 0.23860918 0.07992432 -0.7441818 ] + [-0.8256738 -0.50271106 0.39016065] + [-0.7978571 0.3993331 -0.12494776] + [-0.555991 -0.6705441 -0.23192379] + [-0.5283828 0.19715567 0.12184268]] + input: int64[4 0 0 1 1 0 0 1 1 1 0 0 1 4] from StringToHashBucketFast output + dense_shape: int64[5 8] + indice: int64[[0 0] from to_sparse_input/indices(Where) output + [0 1] + [0 2] + [0 3] + [0 4] + [0 5] + [0 6] + [0 7] + [2 3] + [2 4] + [2 5] + [2 6] + [2 7] + [4 0]] + embedded: float[[-0.25637093 -0.12391002 -0.21055032] + [ 0. 0. 0. ] + [-0.3999606 -0.2696569 -0.06357633] + [ 0. 0. 0. ] + [-0.5283828 0.19715567 0.12184268]] +----------------------------------------------------------------------------------- + input_size: sum of input tensor size == 14 + indice_dim: dim_size(1) of indice tensor[14, 2] == 2 + shape: dense_shape == [5 8] + batch_size: dim of dense_shape == 5 + cols: dim_size(1) of dense_shape == 8 + embedding_size: dim_size(1) of weight tensor == 3 + sparse_gather(input, input_size, indice, indice_dim, shape, batch_size, + cols, weight, output, embedding_size, is_mean); +*/ + +template +class FusedSafeEmbeddingLookupSparseLocalOp : public OpKernel { + public: + explicit FusedSafeEmbeddingLookupSparseLocalOp(OpKernelConstruction *context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("combiner", &combiner_)); + // OP_REQUIRES_OK(context, context->GetAttr("Dims", &dims)); + node_name = context->def().name(); + } + + ~FusedSafeEmbeddingLookupSparseLocalOp() {} + + void Compute(OpKernelContext *context) override { + // Grab the weight + float *weight; + const Tensor *weight_tensor = &context->input(0); + + // for saved model + if (weight_tensor->dtype() == DT_RESOURCE) { + Var *variable; + OP_REQUIRES_OK( + context, + LookupResource(context, HandleFromInput(context, 0), &variable)); + core::ScopedUnref s(variable); + weight_tensor = variable->tensor(); + OP_REQUIRES( + context, weight_tensor->dtype() == DT_FLOAT, + errors::InvalidArgument("Expect float weight in ", node_name)); + } + + weight = (float *)weight_tensor->tensor_data().data(); + + // Input id + const Tensor &input_tensor = context->input(1); + Tid *input = (Tid *)input_tensor.tensor_data().data(); + + const Tensor &shape_tensor = context->input(2); + Tshape *shape = (Tshape *)shape_tensor.tensor_data().data(); + + // To check the input + OP_REQUIRES( + context, (shape_tensor.dims() == 1), + errors::InvalidArgument("Shape tensor is not valid (dims != 1)")); + OP_REQUIRES( + context, (shape_tensor.dim_size(0) >= 2), + errors::InvalidArgument("Shape tensor is not valid (dim_size(0) < 2)")); + + int64 input_size = 1; + for (int i = 0; i < input_tensor.dims(); ++i) { + input_size *= input_tensor.dim_size(i); + } + + int input_dims = 
shape_tensor.dim_size(0); + int cols = shape[input_dims - 1]; + int batch_size = 1; + for (int i = 0; i < input_dims - 1; ++i) { + batch_size *= shape[i]; + } + int embedding_size = weight_tensor->dim_size(1); + bool is_mean = (combiner_ == "mean"); + + const Tensor &indice_tensor = context->input(3); + Tshape *indice = (Tshape *)indice_tensor.tensor_data().data(); + int indice_dim = indice_tensor.dim_size(1); + + // Create an output tensor + Tensor *output_tensor = NULL; + TensorShape output_shape({batch_size, embedding_size}); + OP_REQUIRES_OK(context, + context->allocate_output(0, output_shape, &output_tensor)); + float *output = (float *)output_tensor->tensor_data().data(); + + if (false && input_size == batch_size * cols) { // input id is dense + // fixme(marvin): disable this branch just for test. + sparse_gather(input, batch_size, cols, weight, output, embedding_size, + is_mean); + } else { // input id is sparse + OP_REQUIRES(context, (indice_tensor.dims() == 2), + errors::InvalidArgument( + "Indice tensor is not as expected (dims != 2)")); + OP_REQUIRES( + context, (indice_tensor.dim_size(0) == input_size), + errors::InvalidArgument( + "Indice tensor is not as expected (dim_size(0) != batch_size)")); + sparse_gather(input, input_size, indice, indice_dim, shape, batch_size, + cols, weight, output, embedding_size, is_mean); + } + } + + private: + std::string combiner_; + std::string node_name; +}; + +REGISTER_KERNEL_BUILDER( + Name("FusedSafeEmbeddingLookupSparseLocal") + .Device(DEVICE_CPU) + .TypeConstraint("T_id") + .TypeConstraint("T_shape"), + FusedSafeEmbeddingLookupSparseLocalOp); + +REGISTER_KERNEL_BUILDER( + Name("FusedSafeEmbeddingLookupSparseLocal") + .Device(DEVICE_CPU) + .TypeConstraint("T_id") + .TypeConstraint("T_shape"), + FusedSafeEmbeddingLookupSparseLocalOp); + +enum class SparseSegmentReductionOperation { kSum, kMean, kSqrtN }; + +namespace functor { + +template +struct SparseSegmentGradFunctor { + void operator()(OpKernelContext *context, + SparseSegmentReductionOperation operation, + typename TTypes::ConstMatrix input_flat, + typename TTypes::ConstVec indices_vec, + typename TTypes::ConstVec segment_vec, + typename TTypes::Matrix output_flat) { + const int64_t N = indices_vec.size(); + const SegmentId M = output_flat.dimension(0); + + // Note that similar to SparseSegmentMean, we assume that segment_vec is + // already sorted and has non-negative values. + const SegmentId num_segments = input_flat.dimension(0); + const SegmentId last_segment_id_plus_one = + internal::SubtleMustCopy(segment_vec(N - 1)) + 1; + OP_REQUIRES(context, last_segment_id_plus_one <= num_segments, + errors::InvalidArgument("Invalid number of segments")); + + // Compute scaling factors for input. + std::vector scaling( + (operation == SparseSegmentReductionOperation::kSum ? 
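+            // kSum needs no per-segment scaling, so the vector is sized 0 here;
+            // kMean/kSqrtN allocate one slot per input segment and fill it below.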
0 : num_segments), + 0.0); + if (operation != SparseSegmentReductionOperation::kSum) { + for (int64_t i = 0; i < N; ++i) { + const SegmentId idx = internal::SubtleMustCopy(segment_vec(i)); + OP_REQUIRES( + context, FastBoundsCheck(idx, num_segments), + errors::InvalidArgument("Segment id ", idx, " out of range [0, ", + num_segments, ").")); + scaling[idx] += 1; + } + for (size_t i = 0; i < scaling.size(); ++i) { + switch (operation) { + case SparseSegmentReductionOperation::kSum: { + OP_REQUIRES( + context, false, + errors::Internal( + "Should not happen: sum inside SparseSegmentReductionOp " + "scaling generation.")); + } + case SparseSegmentReductionOperation::kMean: { + scaling[i] = 1.0 / std::max(scaling[i], 1.0); + break; + } + case SparseSegmentReductionOperation::kSqrtN: { + scaling[i] = 1.0 / sqrt(std::max(scaling[i], 1.0)); + break; + } + // No default to get compiler warnings for missing cases. + } + } + } + + output_flat.setZero(); + std::vector is_modified(M, false); + + for (int64_t i = 0; i < N; ++i) { + const Index output_idx = internal::SubtleMustCopy(indices_vec(i)); + OP_REQUIRES(context, FastBoundsCheck(output_idx, M), + errors::InvalidArgument("Index ", output_idx, + " out of range [0, ", M, ").")); + + const SegmentId idx = internal::SubtleMustCopy(segment_vec(i)); + OP_REQUIRES( + context, FastBoundsCheck(idx, num_segments), + errors::InvalidArgument("Segment id ", idx, " out of range [0, ", + num_segments, ").")); + + const T scale = (operation == SparseSegmentReductionOperation::kSum + ? static_cast(1) + : static_cast(scaling[idx])); + if (is_modified[output_idx]) { + if (scale == 1.0) { + output_flat.template chip<0>(output_idx) += + input_flat.template chip<0>(idx); + } else { + output_flat.template chip<0>(output_idx) += + input_flat.template chip<0>(idx) * scale; + } + } else { + if (scale == 1.0) { + output_flat.template chip<0>(output_idx) = + input_flat.template chip<0>(idx); + } else { + output_flat.template chip<0>(output_idx) = + input_flat.template chip<0>(idx) * scale; + } + } + is_modified[output_idx] = true; + } + } +}; + +} // namespace functor + +template +class FusedSafeEmbeddingLookupSparseLocalGradOp : public OpKernel { + public: + explicit FusedSafeEmbeddingLookupSparseLocalGradOp( + OpKernelConstruction *context) + : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("combiner", &combiner_)); + // OP_REQUIRES_OK(context, context->GetAttr("Dims", &dims)); + + if (combiner_ == "sum") { + operation_ = SparseSegmentReductionOperation::kSum; + } else if (combiner_ == "mean") { + operation_ = SparseSegmentReductionOperation::kMean; + } else if (combiner_ == "sqrtn") { + operation_ = SparseSegmentReductionOperation::kSqrtN; + } else { + OP_REQUIRES( + context, false, + errors::InvalidArgument( + "Currently, 'mean', 'sqrtn' and 'sum' are only supported")); + } + + node_name = context->def().name(); + + static bool printed = false; + if (!printed) { + printf("******** FusedSafeEmbeddingLookupSparseLocalGradOp ********\n"); + printed = true; + } + } + + ~FusedSafeEmbeddingLookupSparseLocalGradOp() {} + + void Compute(OpKernelContext *context) override { + // Grab gradients + const Tensor &gradients_tensor = context->input(0); + T *gradients = (T *)gradients_tensor.tensor_data().data(); + OP_REQUIRES( + context, (gradients_tensor.dims() == 2), + errors::InvalidArgument("Gradients tensor is not valid (dims != 2)")); + int64 gradients_row = gradients_tensor.dim_size(0); + int64 embedding_col = gradients_tensor.dim_size(1); + + // Grad input hash 
value + const Tensor &input_tensor = context->input(1); + Tinput *input = (Tinput *)input_tensor.tensor_data().data(); + int64 input_size = 1; + for (int i = 0; i < input_tensor.dims(); ++i) { + input_size *= input_tensor.dim_size(i); + } + + // Grad indices value + const Tensor &indices_tensor = context->input(2); + Tindices *indices_ptr = (Tindices *)indices_tensor.tensor_data().data(); + int indices_row = indices_tensor.dim_size(0); + int indices_col = indices_tensor.dim_size(1); + OP_REQUIRES(context, (indices_tensor.dims() == 2), + errors::InvalidArgument( + "Indice tensor is not as expected (dims != 2)")); + OP_REQUIRES( + context, (indices_tensor.dim_size(0) == input_size), + errors::InvalidArgument( + "Indice tensor is not as expected (dim_size(0) != batch_size)")); + std::vector input_indices; // collect first col + for (int64 i = 0; i < indices_row; ++i) { + input_indices.emplace_back(indices_ptr[i * indices_col]); + } + + // Grad input dense shape + const Tensor &dense_shape_tensor = context->input(3); + Tdense_shape *dense_shape = + (Tdense_shape *)dense_shape_tensor.tensor_data().data(); + OP_REQUIRES( + context, (dense_shape_tensor.dims() == 1), + errors::InvalidArgument("Shape tensor is not valid (dims != 1)")); + OP_REQUIRES( + context, (dense_shape_tensor.dim_size(0) >= 2), + errors::InvalidArgument("Shape tensor is not valid (dim_size(0) < 2)")); + int input_dims = dense_shape_tensor.dim_size(0); + int input_cols = dense_shape[input_dims - 1]; + int batch_size = 1; + for (int i = 0; i < input_dims - 1; ++i) { + batch_size *= dense_shape[i]; + } + OP_REQUIRES( + context, (gradients_row == batch_size), + errors::InvalidArgument("gradients row is not same as batch_size)")); + + // Grad combiner + // bool is_mean = (combiner == 1); + + // compute unique value and indices of input hash value + std::vector unique_value; + std::vector unique_indices; + unique_value.reserve(input_size); + unique_indices.reserve(input_size); + for (int64 i = 0; i < input_size; ++i) { + Tinput id = input[i]; + if (id < 0) { // Skip invalid id + continue; + } + auto it = std::find(unique_value.begin(), unique_value.end(), id); + if (it == unique_value.end()) { // no find + unique_indices.push_back(unique_value.size()); + unique_value.push_back(id); + } else { + unique_indices.push_back(it - unique_value.begin()); + } + } + + // printf("unique_indices: "); + // for (int i = 0; i < unique_indices.size(); ++i) + // printf("%d ", unique_indices[i]); + // printf("\n"); + + // printf("input_indices: "); + // for (int i = 0; i < input_indices.size(); ++i) + // printf("%d ", input_indices[i]); + // printf("\n"); + + // Create an output tensor + Tensor *output_tensor = NULL; + TensorShape output_shape({unique_value.size(), embedding_col}); + OP_REQUIRES_OK(context, + context->allocate_output(0, output_shape, &output_tensor)); + output_tensor->flat().setZero(); + T *output = (T *)output_tensor->tensor_data().data(); + + memset(output, 0, embedding_col * sizeof(float) * unique_value.size()); + + Tensor *unique_tensor = NULL; + TensorShape unique_shape({unique_value.size()}); + OP_REQUIRES_OK(context, + context->allocate_output(1, unique_shape, &unique_tensor)); + Tinput *unique = (Tinput *)unique_tensor->tensor_data().data(); + + int64 unique_num = unique_value.size(); + for (int64 i = 0; i < unique_num; ++i) { + unique[i] = unique_value[i]; + } + + // if (input_size == batch_size * input_cols) { // input id is dense + // } else { // input id is sparse + // } + + if (operation_ == 
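+    // "mean" reuses the shared SparseSegmentGradFunctor below; "sum" takes the
+    // direct row_add accumulation path further down.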
SparseSegmentReductionOperation::kMean) { + auto input_flat = gradients_tensor.flat_outer_dims(); + typename TTypes::ConstVec indices_vec(unique_indices.data(), + unique_indices.size()); + typename TTypes::ConstVec segment_vec(input_indices.data(), + input_indices.size()); + auto output_flat = output_tensor->flat_outer_dims(); + functor::SparseSegmentGradFunctor()( + context, operation_, input_flat, indices_vec, segment_vec, + output_flat); + } else if (operation_ == SparseSegmentReductionOperation::kSum) { + uint64 rows = unique_indices.size(); + // std::vector row_values(unique_value.size(), 0); + std::map> mapSet; + + for (int64 i = 0; i < rows; ++i) { + // row_values[unique_indices[i]] += 1; + + auto index = unique_indices[i] * embedding_col; + // memset(&output[index * embedding_col], 0, embedding_col * + // sizeof(float)); + if (!mapSet.count(&output[index])) { + std::vector srcs; + mapSet[&output[index]] = srcs; + } + mapSet[&output[index]].push_back( + &gradients[input_indices[i] * embedding_col]); + } + + row_add(mapSet, embedding_col); + // printf("******Goto row_add_mean func.******\n"); + // row_add_mean(mapSet, embedding_col, false); + + // for (int i = 0; i < unique_value.size(); ++i) { + // if (row_values[i] == 0) { + // memset(&output[i * embedding_col], 0, embedding_col * + // sizeof(float)); + // } + // } + // delete[] row_values; + + } else if (operation_ == SparseSegmentReductionOperation::kSqrtN) { + } + } + + private: + template + void copy(Tdata *dst, const Tdata *src, const int64 num) { + memcpy(dst, src, num * sizeof(T)); + } + + template + void add(Tdata *dst, const Tdata *src, const int64 num) { + for (int64 i = 0; i < num; ++i) { + dst[i] += src[i]; + } + } + + template + void scale(Tdata *dst, const Tdata factor, const int64 num) { + for (int64 i = 0; i < num; ++i) { + dst[i] *= factor; + } + } + + private: + std::string combiner_; + std::string node_name; + SparseSegmentReductionOperation operation_; +}; + +REGISTER_KERNEL_BUILDER(Name("FusedSafeEmbeddingLookupSparseLocalGrad") + .Device(DEVICE_CPU) + .TypeConstraint("T") + .TypeConstraint("Tinput") + .TypeConstraint("Tindices") + .TypeConstraint("Tdense_shape"), + FusedSafeEmbeddingLookupSparseLocalGradOp< + CPUDevice, float, int64, int32, int64>); + +REGISTER_KERNEL_BUILDER(Name("FusedSafeEmbeddingLookupSparseLocalGrad") + .Device(DEVICE_CPU) + .TypeConstraint("T") + .TypeConstraint("Tinput") + .TypeConstraint("Tindices") + .TypeConstraint("Tdense_shape"), + FusedSafeEmbeddingLookupSparseLocalGradOp< + CPUDevice, float, int64, int64, int64>); + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_local_op_test.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_local_op_test.cc new file mode 100644 index 00000000..dc222b12 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_local_op_test.cc @@ -0,0 +1,901 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/cc/ops/const_op.h" +#include "tensorflow/cc/ops/image_ops.h" +#include "tensorflow/cc/ops/nn_ops.h" +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/conv_ops_gpu.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session.h" + +namespace tensorflow { +namespace { + +enum class Device { CPU, GPU }; + +enum TestCase { Sqrtn, Mean, Sum, SqrtnAndMaxNorm200, MeanAndMaxNorm100 }; + +template +void get_node_attr_from_test_case(string &combiner_str, float &max_norm) { + if (test_case == Sqrtn) { + combiner_str = "sqrtn"; + max_norm = -1.0f; + } else if (test_case == Mean) { + combiner_str = "mean"; + max_norm = -1.0f; + } else if (test_case == Sum) { + combiner_str = "sum"; + max_norm = -1.0f; + } else if (test_case == SqrtnAndMaxNorm200) { + combiner_str = "sqrtn"; + max_norm = 200.0f; + } else if (test_case == MeanAndMaxNorm100) { + combiner_str = "mean"; + max_norm = 100.0f; + } +} + +template +void fill_emb_vector_expected(Tensor *expected); + +template <> +void fill_emb_vector_expected(Tensor *expected) { + test::FillValues( + expected, {22.627416610717773, 24.0416316986084, 25.45584487915039, + 26.870058059692383, 28.284271240234375, 29.698484420776367, + 31.112699508666992, 32.526912689208984, 73.90083312988281, + 75.63288879394531, 77.36493682861328, 79.09698486328125, + 80.82904052734375, 82.56108856201172, 84.29314422607422, + 86.02519226074219, 124.70765686035156, 126.43971252441406, + 128.17176818847656, 129.90380859375, 131.6358642578125, + 133.367919921875, 135.09996032714844, 136.83201599121094, + 107.48023223876953, 108.89444732666016, 110.30866241455078, + 111.72286987304688, 113.1370849609375, 114.55130004882812, + 115.96551513671875, 117.37973022460938}); +} + +template <> +void fill_emb_vector_expected(Tensor *expected) { + test::FillValues( + expected, {16.00000000000000, 17.00000000000000, 18.00000000000000, + 19.00000000000000, 20.00000000000000, 21.00000000000000, + 22.00000000000000, 23.00000000000000, 42.66666793823242, + 43.66666793823242, 44.66666793823242, 45.66666793823242, + 46.66666793823242, 47.66666793823242, 48.66666793823242, + 49.66666793823242, 72.00000000000000, 73.00000000000000, + 74.00000000000000, 75.00000000000000, 76.00000000000000, + 77.00000000000000, 78.00000000000000, 79.00000000000000, + 76.00000000000000, 77.00000000000000, 78.00000000000000, + 79.00000000000000, 80.00000000000000, 81.00000000000000, + 82.00000000000000, 83.00000000000000}); +} + +template <> +void fill_emb_vector_expected(Tensor *expected) { + test::FillValues( + expected, {32.0, 34.0, 36.0, 38.0, 40.0, 42.0, 44.0, 46.0, + 128.0, 131.0, 134.0, 137.0, 140.0, 143.0, 146.0, 149.0, + 216.0, 219.0, 222.0, 225.0, 228.0, 231.0, 234.0, 237.0, + 152.0, 154.0, 156.0, 158.0, 160.0, 162.0, 164.0, 166.0}); +} + +template <> +void fill_emb_vector_expected(Tensor *expected) { + test::FillValues( + expected, + 
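+      // Same sqrtn combine as above (row = sum of gathered embeddings / sqrt(#ids)),
+      // except embedding rows whose L2 norm exceeds max_norm (200 here) are first
+      // rescaled down to that norm; e.g. embedding row 12 has norm ~281.5, so it is
+      // scaled by 200/281.5 before combining, which is why only the last two batch
+      // rows differ from the plain sqrtn expectation.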
{22.62741661, 24.04163170, 25.45584488, 26.87005806, 28.28427124, + 29.69848442, 31.11269951, 32.52691269, 73.90083313, 75.63288879, + 77.36493683, 79.09698486, 80.82904053, 82.56108856, 84.29314423, + 86.02519226, 92.61308289, 94.01081848, 95.40855408, 96.80628204, + 98.20401764, 99.60175323, 100.99948120, 102.39721680, 71.20205688, + 72.31395721, 73.42584991, 74.53774261, 75.64963531, 76.76153564, + 77.87342834, 78.98532867}); +} + +class FusedEmbeddingLocalSparseLookUpOpTest : public OpsTestBase { + protected: + template + void Run(Device device) { + if (device == Device::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + DataType dtype = DataTypeToEnum::value; + std::string combiner_str; + float max_norm; + + get_node_attr_from_test_case(combiner_str, max_norm); + + TF_EXPECT_OK(NodeDefBuilder("fused_embedding_local_sparse_look_up", + "FusedEmbeddingLocalSparseLookUp") + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(dtype)) + .Attr("T", dtype) + .Attr("combiner", combiner_str) + .Attr("max_norm", max_norm) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + const int nnz = 10; + const int batch_size = 4; + const int emb_vector_dim = 8; + const int entries = 8; + const int bucket_size = 16; + + Tensor sp_values(DT_INT64, {nnz}); + Tensor sp_indices(DT_INT64, {nnz, 2}); + Tensor sp_dense_shape(DT_INT64, {2}); + Tensor emb_variable(dtype, {bucket_size, emb_vector_dim}); + + test::FillValues(&sp_values, {3, 1, 4, 5, 7, 3, 12, 12, 15, 4}); + test::FillValues(&sp_indices, {0, 1, 0, 5, 1, 2, 1, 1, 1, 7, + 2, 1, 2, 4, 2, 7, 3, 0, 3, 6}); + test::FillValues(&sp_dense_shape, {batch_size, entries}); + test::FillValues( + &emb_variable, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, + 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, + 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, + 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, + 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, + 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, + 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, + 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, + 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, + 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, + 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, + 110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, + 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0}); + + AddInputFromArray(sp_values.shape(), sp_values.flat()); + AddInputFromArray(sp_indices.shape(), sp_indices.flat()); + AddInputFromArray(sp_dense_shape.shape(), + sp_dense_shape.flat()); + AddInputFromArray(emb_variable.shape(), emb_variable.flat()); + + TF_ASSERT_OK(RunOpKernel()); + + Tensor emb_vector_expected(dtype, {batch_size, emb_vector_dim}); + Tensor sp_values_offset_expected(DT_INT32, {batch_size}); + fill_emb_vector_expected(&emb_vector_expected); + test::FillValues(&sp_values_offset_expected, {0, 2, 5, 8}); + + const Tensor &emb_vector = *GetOutput(0); + const Tensor &values_offset = *GetOutput(1); + TF_EXPECT_OK(device_->Sync()); + + test::ExpectTensorNear(emb_vector_expected, emb_vector, 1e-4); + test::ExpectTensorEqual(sp_values_offset_expected, values_offset); + } +}; + +template +void fill_grad_expected(Tensor *expected); + +template <> +void fill_grad_expected(Tensor *expected) { + test::FillValues( + 
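+      // sqrtn gradient: each id in a batch row with n valid ids receives
+      // top_grad_row / sqrt(n). Batch row 0 holds 2 ids, so its per-id rows start
+      // at 0/sqrt(2)=0, 1/sqrt(2)~0.7071, 2/sqrt(2)~1.4142, ...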
expected, {0.000000000000000, 0.7071067690849304, 1.4142135381698608, + 2.1213204860687256, 2.8284270763397217, 3.535533905029297, + 4.242640972137451, 4.949747562408447, 0.000000000000000, + 0.7071067690849304, 1.4142135381698608, 2.1213204860687256, + 2.8284270763397217, 3.535533905029297, 4.242640972137451, + 4.949747562408447, 4.618802070617676, 5.196152687072754, + 5.773502826690674, 6.350852966308594, 6.928203582763672, + 7.505553722381592, 8.082903861999512, 8.66025447845459, + 4.618802070617676, 5.196152687072754, 5.773502826690674, + 6.350852966308594, 6.928203582763672, 7.505553722381592, + 8.082903861999512, 8.66025447845459, 4.618802070617676, + 5.196152687072754, 5.773502826690674, 6.350852966308594, + 6.928203582763672, 7.505553722381592, 8.082903861999512, + 8.66025447845459, 9.237604141235352, 9.81495475769043, + 10.392305374145508, 10.96965503692627, 11.547005653381348, + 12.124356269836426, 12.701705932617188, 13.279056549072266, + 9.237604141235352, 9.81495475769043, 10.392305374145508, + 10.96965503692627, 11.547005653381348, 12.124356269836426, + 12.701705932617188, 13.279056549072266, 9.237604141235352, + 9.81495475769043, 10.392305374145508, 10.96965503692627, + 11.547005653381348, 12.124356269836426, 12.701705932617188, + 13.279056549072266, 16.970563888549805, 17.677669525146484, + 18.384777069091797, 19.091882705688477, 19.79899024963379, + 20.5060977935791, 21.21320343017578, 21.920310974121094, + 16.970563888549805, 17.677669525146484, 18.384777069091797, + 19.091882705688477, 19.79899024963379, 20.5060977935791, + 21.21320343017578, 21.920310974121094}); +} + +template <> +void fill_grad_expected(Tensor *expected) { + test::FillValues( + expected, {0.000000000000000, 0.500000000000000, 1.000000000000000, + 1.500000000000000, 2.000000000000000, 2.500000000000000, + 3.000000000000000, 3.500000000000000, 0.000000000000000, + 0.500000000000000, 1.000000000000000, 1.500000000000000, + 2.000000000000000, 2.500000000000000, 3.000000000000000, + 3.500000000000000, 2.6666667461395264, 3.000000000000000, + 3.3333332538604736, 3.6666667461395264, 4.000000000000000, + 4.333333492279053, 4.666666507720947, 5.000000000000000, + 2.6666667461395264, 3.000000000000000, 3.3333332538604736, + 3.6666667461395264, 4.000000000000000, 4.333333492279053, + 4.666666507720947, 5.000000000000000, 2.6666667461395264, + 3.000000000000000, 3.3333332538604736, 3.6666667461395264, + 4.000000000000000, 4.333333492279053, 4.666666507720947, + 5.000000000000000, 5.333333492279053, 5.666666507720947, + 6.000000000000000, 6.333333492279053, 6.666666507720947, + 7.000000000000000, 7.333333492279053, 7.666666507720947, + 5.333333492279053, 5.666666507720947, 6.000000000000000, + 6.333333492279053, 6.666666507720947, 7.000000000000000, + 7.333333492279053, 7.666666507720947, 5.333333492279053, + 5.666666507720947, 6.000000000000000, 6.333333492279053, + 6.666666507720947, 7.000000000000000, 7.333333492279053, + 7.666666507720947, 12.000000000000000, 12.500000000000000, + 13.000000000000000, 13.500000000000000, 14.000000000000000, + 14.500000000000000, 15.000000000000000, 15.500000000000000, + 12.000000000000000, 12.500000000000000, 13.000000000000000, + 13.500000000000000, 14.000000000000000, 14.500000000000000, + 15.000000000000000, 15.500000000000000}); +} + +template <> +void fill_grad_expected(Tensor *expected) { + test::FillValues( + expected, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 0.0, 1.0, 2.0, 3.0, + 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 8.0, 9.0, 10.0, 11.0, 
12.0, 13.0, 14.0, 15.0, 8.0, 9.0, 10.0, 11.0, + 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, + 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 16.0, 17.0, 18.0, 19.0, + 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0}); +} + +template <> +void fill_grad_expected(Tensor *expected) { + test::FillValues( + expected, + {0.00000000, 0.50000000, 1.00000000, 1.50000000, 2.00000000, + 2.50000000, 3.00000000, 3.50000000, 0.00000000, 0.50000000, + 1.00000000, 1.50000000, 2.00000000, 2.50000000, 3.00000000, + 3.50000000, 2.65028572, 2.98157120, 3.31285667, 3.64414287, + 3.97542834, 4.30671406, 4.63799953, 4.96928549, 2.16437674, + 2.43492365, 2.70547056, 2.97601795, 3.24656487, 3.51711202, + 3.78765893, 4.05820608, 1.58337951, 1.78130186, 1.97922409, + 2.17714667, 2.37506914, 2.57299161, 2.77091384, 2.96883631, + 5.33333349, 5.66666651, 6.00000000, 6.33333349, 6.66666651, + 7.00000000, 7.33333349, 7.66666651, 1.89459133, 2.01300311, + 2.13141513, 2.24982715, 2.36823893, 2.48665094, 2.60506320, + 2.72347474, 1.89459133, 2.01300311, 2.13141513, 2.24982715, + 2.36823893, 2.48665094, 2.60506320, 2.72347474, 3.43474555, + 3.57786012, 3.72097445, 3.86408877, 4.00720310, 4.15031767, + 4.29343224, 4.43654633, 11.92628479, 12.42321396, 12.92014217, + 13.41707039, 13.91399956, 14.41092777, 14.90785599, 15.40478516}); +} + +class FusedEmbeddingLocalSparseLookUpGradOpTest : public OpsTestBase { + protected: + template + void Run(Device device) { + if (device == Device::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + DataType dtype = DataTypeToEnum::value; + std::string combiner_str; + float max_norm; + get_node_attr_from_test_case(combiner_str, max_norm); + + TF_EXPECT_OK(NodeDefBuilder("fused_embedding_local_sparse_look_up_grad", + "FusedEmbeddingLocalSparseLookUpGrad") + .Input(FakeInput(dtype)) + .Input(FakeInput(dtype)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT32)) + .Attr("T", dtype) + .Attr("combiner", combiner_str) + .Attr("max_norm", max_norm) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + const int nnz = 10; + const int batch_size = 4; + const int emb_vector_dim = 8; + const int bucket_size = 16; + + Tensor top_grad(dtype, {batch_size, emb_vector_dim}); + Tensor emb_variable(dtype, {bucket_size, emb_vector_dim}); + Tensor sp_values(DT_INT64, {nnz}); + Tensor sp_values_offset(DT_INT32, {batch_size}); + + test::FillValues( + &top_grad, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, + 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0}); + test::FillValues( + &emb_variable, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, + 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, + 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, + 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, + 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, + 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, + 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, + 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, + 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, + 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, + 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, + 110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, + 120.0, 
121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0}); + test::FillValues(&sp_values, {3, 1, 4, 5, 7, 3, 12, 12, 15, 4}); + test::FillValues(&sp_values_offset, {0, 2, 5, 8}); + + AddInputFromArray(top_grad.shape(), top_grad.flat()); + AddInputFromArray(emb_variable.shape(), emb_variable.flat()); + AddInputFromArray(sp_values.shape(), sp_values.flat()); + AddInputFromArray(sp_values_offset.shape(), + sp_values_offset.flat()); + + TF_ASSERT_OK(RunOpKernel()); + + Tensor grad_expected(dtype, {nnz, emb_vector_dim}); + fill_grad_expected(&grad_expected); + + const Tensor &grad = *GetOutput(0); + TF_EXPECT_OK(device_->Sync()); + + test::ExpectTensorNear(grad_expected, grad, 1e-4); + } +}; + +TEST_F(FusedEmbeddingLocalSparseLookUpOpTest, LocalFloatSumCpu) { + TF_EXPECT_OK(NodeDefBuilder("FusedSafeEmbeddingLookupSparseLocal", + "FusedSafeEmbeddingLookupSparseLocal") + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Attr("T", DT_FLOAT) + .Attr("combiner", "sum") + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + const int nnz = 10; + const int batch_size = 4; + const int emb_vector_dim = 8; + const int entries = 8; + const int bucket_size = 16; + + Tensor sp_values(DT_INT64, {nnz}); + Tensor sp_weight(DT_INT64, {nnz}); + Tensor sp_indices(DT_INT64, {nnz, 2}); + Tensor sp_dense_shape(DT_INT64, {2}); + Tensor emb_variable(DT_FLOAT, {bucket_size, emb_vector_dim}); + + // [3, 1, 4, 5, 7, 3, 12, 12, 15, 4] + test::FillValues(&sp_values, {3, 1, 4, 5, 7, 3, 12, 12, 15, 4}); + test::FillValues(&sp_weight, {3, 1, 4, 5, 7, 3, 12, 12, 15, 4}); + // [0, 0, 1, 1, 1, 2, 2, 2, 3, 3] + test::FillValues(&sp_indices, {0, 1, 0, 5, 1, 2, 1, 1, 1, 7, + 2, 1, 2, 4, 2, 7, 3, 0, 3, 6}); + test::FillValues(&sp_dense_shape, {batch_size, entries}); + test::FillValues( + &emb_variable, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, + 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, + 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, + 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, + 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, + 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, + 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, + 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, + 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, + 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, + 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, + 110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, + 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0}); + + AddInputFromArray(emb_variable.shape(), emb_variable.flat()); + AddInputFromArray(sp_values.shape(), sp_values.flat()); + AddInputFromArray(sp_dense_shape.shape(), + sp_dense_shape.flat()); + AddInputFromArray(sp_indices.shape(), sp_indices.flat()); + + TF_ASSERT_OK(RunOpKernel()); + + Tensor emb_vector_expected(DT_FLOAT, {batch_size, emb_vector_dim}); + // Tensor sp_values_offset_expected(DT_INT32, {batch_size}); + fill_emb_vector_expected(&emb_vector_expected); + // test::FillValues(&sp_values_offset_expected, {0, 2, 5, 8}); + + const Tensor &emb_vector = *GetOutput(0); + // const Tensor& values_offset = *GetOutput(1); + // TF_EXPECT_OK(device_->Sync()); + + float *output = (float *)emb_vector.tensor_data().data(); + float *output_ex = (float *)emb_vector_expected.tensor_data().data(); + + test::ExpectTensorNear(emb_vector_expected, 
emb_vector, 1e-2); + // test::ExpectTensorEqual(sp_values_offset_expected, values_offset); +} + +TEST_F(FusedEmbeddingLocalSparseLookUpOpTest, LocalGradFloatSumCpu) { + TF_EXPECT_OK(NodeDefBuilder("FusedSafeEmbeddingLookupSparseLocalGrad", + "FusedSafeEmbeddingLookupSparseLocalGrad") + .Input(FakeInput(DT_FLOAT)) // gradients + .Input(FakeInput(DT_INT64)) // input hash value + .Input(FakeInput(DT_INT64)) // dense_shape + .Input(FakeInput(DT_INT64)) // indices + .Attr("T", DT_FLOAT) + .Attr("Tinput", DT_INT64) + .Attr("Tindices", DT_INT64) + .Attr("Tdense_shape", DT_INT64) + .Attr("combiner", "sum") + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + const int nnz = 32; + const int batch_size = 32; + const int emb_vector_dim = 4; + const int entries = 1; + const int bucket_size = 16; + + Tensor sp_values(DT_INT64, {nnz}); + Tensor sp_indices(DT_INT64, {nnz, 2}); + Tensor sp_dense_shape(DT_INT64, {2}); + Tensor grad_variable(DT_FLOAT, {batch_size, emb_vector_dim}); + + test::FillValues( + &grad_variable, + {-0.00363823911, 0.0138593055, 0.00232614437, 0.00241222954, + -0.000268990319, -0.00410466315, 0.00478722388, -0.000196215493, + -0.0044340631, -0.00725936424, -0.00691315765, -0.00612797868, + -0.00678675482, -0.00246100035, 0.00216219737, -0.00346030248, + 0.00100048154, -0.00852716807, 0.00803291425, -0.000800206966, + -3.03583856e-05, 0.00524863973, -0.0163001865, -0.0109826243, + 0.0830041766, 0.153927863, -0.0508279465, -0.00474824524, + 7.8225421e-05, -0.000293536956, 0.00610643439, -0.00019871055, + -0.000780000235, -0.00221115421, 0.00387162319, 0.00222597015, + -0.0102384416, -0.00801581, -0.0017716008, 0.00598057127, + -0.00808391348, -0.00166459556, 0.00106997311, -0.00185864791, + 0.00491535058, -0.00633693347, 0.0212651137, 0.00704831816, + -0.00338345463, -0.00668374076, -0.0000871402444, -0.000196078254, + 0.00254824688, -0.00249796058, -0.0034719836, -0.003478111, + 6.03029093e-06, -0.00211180653, 0.000114592229, -0.00240143575, + -0.00592383416, -0.00984606426, 0.00129341101, 0.00100650277, + 0.000906444562, -0.00139640097, -0.000192714069, 0.00277191238, + -0.000245573436, -0.00680374401, 0.00356984767, -0.00120577728, + -0.000766036392, -0.00487764599, 0.000532136182, -0.00413817167, + -0.0302855149, -0.0406391025, 0.0006130244, 0.0183675159, + -0.00247384049, -0.00609699031, 0.00127684267, -0.00235637, + 0.00715987338, 0.00783564895, -0.00139878597, -0.0048744888, + 0.00356917572, -0.0164020304, 0.0179400034, 0.000975746894, + -0.00529623777, -0.00490315, 0.00691250199, 0.00286021968, + -0.00426661829, -0.00417789398, -0.00597105641, -0.00605484238, + 0.00197085389, -0.00757023226, 0.00458694575, 0.00153650146, + -0.00345475, -0.00823391136, 0.000807857723, 0.0121598523, + -0.00745406374, -0.0135948248, 0.004774753, -0.00390140619, + -0.00208005216, -0.00362896058, 0.00558064319, -0.000532045437, + -0.00854093302, 0.00566324079, -0.00435794424, 0.00403016619, + 0.000468764076, 0.000297251798, -0.00617758604, -0.00338481856, + 0.00280403625, -0.00649327, -0.000154057736, -0.000479023496}); + test::FillValues(&sp_values, + {9, 2, 9, 2, 2, 9, 2, 2, 2, 2, 2, 2, 9, 2, 2, 2, + 9, 2, 2, 2, 2, 9, 2, 9, 2, 2, 2, 9, 2, 9, 2, 2}); + test::FillValues( + &sp_indices, {0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, + 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 0, + 16, 0, 17, 0, 18, 0, 19, 0, 20, 0, 21, 0, 22, 0, 23, 0, + 24, 0, 25, 0, 26, 0, 27, 0, 28, 0, 29, 0, 30, 0, 31, 0}); + test::FillValues(&sp_dense_shape, {batch_size, entries}); + + 
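+  // With combiner "sum" the grad op first de-duplicates the input ids (only 9 and
+  // 2 appear above) and then element-wise sums every incoming gradient row that
+  // used a given id into that id's single output row.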
AddInputFromArray(grad_variable.shape(), grad_variable.flat()); + AddInputFromArray(sp_values.shape(), sp_values.flat()); + AddInputFromArray(sp_indices.shape(), sp_indices.flat()); + AddInputFromArray(sp_dense_shape.shape(), + sp_dense_shape.flat()); + + TF_ASSERT_OK(RunOpKernel()); + + Tensor output1_tensor_expected(DT_FLOAT, {2, emb_vector_dim}); + Tensor output2_tensor_expected(DT_INT64, {2}); + + test::FillValues( + &output1_tensor_expected, + {-0.0247110315, -0.00123064546, -0.0152365314, -0.0140080471, + 0.0247110203, 0.00123063289, 0.0152365509, 0.0140080536}); + + test::FillValues(&output2_tensor_expected, {9, 2}); + float *output1_ex = (float *)output1_tensor_expected.tensor_data().data(); + int64 *output2_ex = (int64 *)output2_tensor_expected.tensor_data().data(); + + const Tensor &output1_tensor = *GetOutput(0); + const Tensor &output2_tensor = *GetOutput(1); + + float *output1 = (float *)output1_tensor.tensor_data().data(); + int64 *output2 = (int64 *)output2_tensor.tensor_data().data(); + + printf("out = %.11f , expect = %.11f\n", output1[5], output1_ex[5]); + printf("out = %.11f , expect = %.11f\n", output1[7], output1_ex[7]); + test::ExpectTensorNear(output1_tensor_expected, output1_tensor, 1e-8); + test::ExpectTensorEqual(output2_tensor_expected, output2_tensor); +} + +TEST_F(FusedEmbeddingLocalSparseLookUpOpTest, LocalGradFloatMeanCpu) { + TF_EXPECT_OK(NodeDefBuilder("FusedSafeEmbeddingLookupSparseLocalGrad", + "FusedSafeEmbeddingLookupSparseLocalGrad") + .Input(FakeInput(DT_FLOAT)) // gradients + .Input(FakeInput(DT_INT64)) // input hash value + .Input(FakeInput(DT_INT64)) // dense_shape + .Input(FakeInput(DT_INT64)) // indices + .Attr("T", DT_FLOAT) + .Attr("Tinput", DT_INT64) + .Attr("Tindices", DT_INT64) + .Attr("Tdense_shape", DT_INT64) + .Attr("combiner", "mean") + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + const int nnz = 9; + const int batch_size = 5; + const int emb_vector_dim = 4; + const int entries = 8; + const int bucket_size = 16; + + Tensor sp_values(DT_INT64, {nnz}); + Tensor sp_indices(DT_INT64, {nnz, 2}); + Tensor sp_dense_shape(DT_INT64, {2}); + Tensor grad_variable(DT_FLOAT, {batch_size, emb_vector_dim}); + + test::FillValues( + &grad_variable, {0.0103125420, 0.018807490, -0.0106398590, -0.029409127, + 0.0054132286, 0.013920069, -0.0190976150, -0.023196392, + 0.0100601720, 0.015330995, -0.0055795530, -0.024889620, + 0.0108455080, 0.018832123, -0.0095151365, -0.029357582, + 0.0100478110, 0.018798435, -0.0112019650, -0.029439624}); + test::FillValues(&sp_values, {1, 1, 0, 4, 1, 1, 1, 0, 1}); + test::FillValues( + &sp_indices, {0, 1, 0, 3, 0, 6, 1, 3, 1, 6, 3, 3, 3, 4, 4, 1, 4, 7}); + test::FillValues(&sp_dense_shape, {batch_size, entries}); + + AddInputFromArray(grad_variable.shape(), grad_variable.flat()); + AddInputFromArray(sp_values.shape(), sp_values.flat()); + AddInputFromArray(sp_indices.shape(), sp_indices.flat()); + AddInputFromArray(sp_dense_shape.shape(), + sp_dense_shape.flat()); + + TF_ASSERT_OK(RunOpKernel()); + + Tensor output1_tensor_expected(DT_FLOAT, {3, emb_vector_dim}); + Tensor output2_tensor_expected(DT_INT64, {3}); + test::FillValues( + &output1_tensor_expected, + {0.0254510570, 0.0477297000, -0.0317581670, -0.075281680, 0.0084614195, + 0.0156683810, -0.0091476020, -0.024522856, 0.0027066143, 0.0069600344, + -0.0095488075, -0.011598196}); + test::FillValues(&output2_tensor_expected, {1, 0, 4}); + float *output1_ex = (float *)output1_tensor_expected.tensor_data().data(); + int64 *output2_ex = (int64 
*)output2_tensor_expected.tensor_data().data(); + + const Tensor &output1_tensor = *GetOutput(0); + const Tensor &output2_tensor = *GetOutput(1); + + float *output1 = (float *)output1_tensor.tensor_data().data(); + int64 *output2 = (int64 *)output2_tensor.tensor_data().data(); + + // printf("out = %f , expect = %f\n", output1[0], output1_ex[0]); + // printf("out = %f , expect = %f\n", output1[1], output1_ex[1]); + // printf("out = %f , expect = %f\n", output1[2], output1_ex[2]); + // printf("out = %f , expect = %f\n", output1[3], output1_ex[3]); + + // printf("out = %d , expect = %d\n", output2[0], output2_ex[0]); + // printf("out = %d , expect = %d\n", output2[1], output2_ex[1]); + // printf("out = %d , expect = %d\n", output2[2], output2_ex[2]); + + test::ExpectTensorNear(output1_tensor_expected, output1_tensor, 1e-8); + test::ExpectTensorEqual(output2_tensor_expected, output2_tensor); +} + +TEST_F(FusedEmbeddingLocalSparseLookUpOpTest, FloatSumCpu) { + TF_EXPECT_OK(NodeDefBuilder("FusedSafeEmbeddingLookupSparse", + "FusedSafeEmbeddingLookupSparse") + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Attr("T", DT_FLOAT) + .Attr("combiner", "sum") + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + const int nnz = 9; + const int batch_size = 5; + const int emb_vector_dim = 4; + const int entries = 8; + const int gathered_weight_size = 3; + + Tensor sp_values(DT_INT64, {nnz}); + Tensor sp_weight(DT_INT64, {nnz}); + Tensor sp_indices(DT_INT64, {nnz, 2}); + Tensor sp_dense_shape(DT_INT64, {2}); + Tensor emb_variable(DT_FLOAT, {gathered_weight_size, emb_vector_dim}); + + // [1 1 0 4 1 1 1 0 1] -> [1 0 4], [0 0 1 2 0 0 0 1 0] + test::FillValues(&sp_values, {0, 0, 1, 2, 0, 0, 0, 1, 0}); + test::FillValues(&sp_weight, {0, 0, 1, 2, 0, 0, 0, 1, 0}); + // [0 0 0 1 1 3 3 4 4] + test::FillValues( + &sp_indices, {0, 1, 0, 3, 0, 6, 1, 3, 1, 6, 3, 3, 3, 4, 4, 1, 4, 7}); + test::FillValues(&sp_dense_shape, {batch_size, entries}); + test::FillValues( + &emb_variable, {-0.023765106, -0.248630840, 0.275294270, 0.228118000, + -0.147108670, -0.298352200, -0.067187610, 0.274558250, + 0.491792620, -0.094891705, 0.064489834, 0.058840238}); + + AddInputFromArray(emb_variable.shape(), emb_variable.flat()); + AddInputFromArray(sp_values.shape(), sp_values.flat()); + AddInputFromArray(sp_dense_shape.shape(), + sp_dense_shape.flat()); + AddInputFromArray(sp_indices.shape(), sp_indices.flat()); + + TF_ASSERT_OK(RunOpKernel()); + + Tensor emb_vector_expected(DT_FLOAT, {batch_size, emb_vector_dim}); + + test::FillValues( + &emb_vector_expected, + {-0.19463888, -0.79561390, 0.48340094, 0.73079425, 0.46802750, + -0.34352255, 0.33978412, 0.28695825, 0.00000000, 0.00000000, + 0.00000000, 0.00000000, -0.04753021, -0.49726167, 0.55058855, + 0.45623600, -0.17087378, -0.54698306, 0.20810667, 0.50267625}); + + const Tensor &emb_vector = *GetOutput(0); + + float *output = (float *)emb_vector.tensor_data().data(); + float *output_ex = (float *)emb_vector_expected.tensor_data().data(); + + test::ExpectTensorNear(emb_vector_expected, emb_vector, 1e-8); +} + +TEST_F(FusedEmbeddingLocalSparseLookUpOpTest, FloatMeanCpu) { + TF_EXPECT_OK(NodeDefBuilder("FusedSafeEmbeddingLookupSparse", + "FusedSafeEmbeddingLookupSparse") + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Attr("T", DT_FLOAT) + .Attr("combiner", "mean") + 
.Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + const int nnz = 9; + const int batch_size = 5; + const int emb_vector_dim = 4; + const int entries = 8; + const int gathered_weight_size = 3; + + Tensor sp_values(DT_INT64, {nnz}); + Tensor sp_weight(DT_INT64, {nnz}); + Tensor sp_indices(DT_INT64, {nnz, 2}); + Tensor sp_dense_shape(DT_INT64, {2}); + Tensor emb_variable(DT_FLOAT, {gathered_weight_size, emb_vector_dim}); + + // [1 1 0 4 1 1 1 0 1] -> [1 0 4], [0 0 1 2 0 0 0 1 0] + test::FillValues(&sp_values, {0, 0, 1, 2, 0, 0, 0, 1, 0}); + test::FillValues(&sp_weight, {0, 0, 1, 2, 0, 0, 0, 1, 0}); + // [0 0 0 1 1 3 3 4 4] + test::FillValues( + &sp_indices, {0, 1, 0, 3, 0, 6, 1, 3, 1, 6, 3, 3, 3, 4, 4, 1, 4, 7}); + test::FillValues(&sp_dense_shape, {batch_size, entries}); + test::FillValues(&emb_variable, + {-0.02299355, -0.247596220, 0.27484232, 0.226618130, + -0.14686598, -0.297978460, -0.06733219, 0.273977040, + 0.49191360, -0.094738655, 0.06426916, 0.058573183}); + + AddInputFromArray(emb_variable.shape(), emb_variable.flat()); + AddInputFromArray(sp_values.shape(), sp_values.flat()); + AddInputFromArray(sp_dense_shape.shape(), + sp_dense_shape.flat()); + AddInputFromArray(sp_indices.shape(), sp_indices.flat()); + + TF_ASSERT_OK(RunOpKernel()); + + Tensor emb_vector_expected(DT_FLOAT, {batch_size, emb_vector_dim}); + test::FillValues( + &emb_vector_expected, + {-0.064284360, -0.26439032, 0.160784140, 0.24240442, 0.234460030, + -0.17116743, 0.169555740, 0.14259565, 0.000000000, 0.00000000, + 0.000000000, 0.00000000, -0.022993550, -0.24759622, 0.274842320, + 0.22661813, -0.084929764, -0.27278733, 0.103755064, 0.25029758}); + + const Tensor &emb_vector = *GetOutput(0); + + float *output = (float *)emb_vector.tensor_data().data(); + float *output_ex = (float *)emb_vector_expected.tensor_data().data(); + + test::ExpectTensorNear(emb_vector_expected, emb_vector, 1e-7); +} + +TEST_F(FusedEmbeddingLocalSparseLookUpOpTest, GradFloatSumCpu) { + TF_EXPECT_OK(NodeDefBuilder("FusedSafeEmbeddingLookupSparseGrad", + "FusedSafeEmbeddingLookupSparseGrad") + .Input(FakeInput(DT_FLOAT)) // gradients + .Input(FakeInput(DT_INT64)) // unique_id + .Input(FakeInput(DT_INT64)) // unique_indices + .Input(FakeInput(DT_INT64)) // dense_shape + .Input(FakeInput(DT_INT64)) // indices + .Attr("T", DT_FLOAT) + .Attr("Tinput", DT_INT64) + .Attr("Tindices", DT_INT64) + .Attr("Tdense_shape", DT_INT64) + .Attr("combiner", "sum") + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + const int unique_size = 3; + const int nnz = 9; + const int batch_size = 5; + const int emb_vector_dim = 4; + const int entries = 8; + + Tensor unique_id(DT_INT64, {unique_size}); + Tensor unique_indices(DT_INT64, {nnz}); + Tensor sp_indices(DT_INT64, {nnz, 2}); + Tensor sp_dense_shape(DT_INT64, {2}); + Tensor grad_variable(DT_FLOAT, {batch_size, emb_vector_dim}); + + test::FillValues( + &grad_variable, + {0.0076283700764179229736328125, 0.0121669657528400421142578125, + -0.0049919090233743190765380859, -0.0190300568938255310058593750, + 0.0065145129337906837463378906, 0.0117923058569431304931640625, + -0.0164990965276956558227539062, -0.0200323350727558135986328125, + 0.0100607946515083312988281250, 0.0153625328093767166137695312, + -0.0056031607091426849365234375, -0.0249206330627202987670898438, + 0.0099571626633405685424804688, 0.0154269225895404815673828125, + -0.0055019007995724678039550781, -0.0239365808665752410888671875, + 0.0084272380918264389038085938, 0.0152924191206693649291992188, + -0.0086676068603992462158203125, 
-0.0239860229194164276123046875}); + test::FillValues(&unique_id, {1, 0, 4}); + test::FillValues(&unique_indices, {0, 0, 1, 2, 0, 0, 0, 1, 0}); + test::FillValues( + &sp_indices, {0, 1, 0, 3, 0, 6, 1, 3, 1, 6, 3, 3, 3, 4, 4, 1, 4, 7}); + test::FillValues(&sp_dense_shape, {batch_size, entries}); + + AddInputFromArray(grad_variable.shape(), grad_variable.flat()); + AddInputFromArray(unique_id.shape(), unique_id.flat()); + AddInputFromArray(unique_indices.shape(), + unique_indices.flat()); + AddInputFromArray(sp_indices.shape(), sp_indices.flat()); + AddInputFromArray(sp_dense_shape.shape(), + sp_dense_shape.flat()); + + TF_ASSERT_OK(RunOpKernel()); + + Tensor output1_tensor_expected(DT_FLOAT, {unique_size, emb_vector_dim}); + Tensor output2_tensor_expected(DT_INT64, {unique_size}); + test::FillValues( + &output1_tensor_expected, + {0.0501128211617469787597656250, 0.0822724997997283935546875000, + -0.0461543202400207519531250000, -0.1299516409635543823242187500, + 0.0160556081682443618774414062, 0.0274593848735094070434570312, + -0.0136595163494348526000976562, -0.0430160798132419586181640625, + 0.0065145129337906837463378906, 0.0117923058569431304931640625, + -0.0164990965276956558227539062, -0.0200323369354009628295898438}); + test::FillValues(&output2_tensor_expected, {1, 0, 4}); + float *output1_ex = (float *)output1_tensor_expected.tensor_data().data(); + int64 *output2_ex = (int64 *)output2_tensor_expected.tensor_data().data(); + + const Tensor &output1_tensor = *GetOutput(0); + const Tensor &output2_tensor = *GetOutput(1); + + float *output1 = (float *)output1_tensor.tensor_data().data(); + int64 *output2 = (int64 *)output2_tensor.tensor_data().data(); + + printf("out = %.28f , expect = %.28f\n", output1[11], output1_ex[11]); + + test::ExpectTensorNear(output1_tensor_expected, output1_tensor, 1e-8); + test::ExpectTensorEqual(output2_tensor_expected, output2_tensor); +} + +TEST_F(FusedEmbeddingLocalSparseLookUpOpTest, GradFloatMeanCpu) { + TF_EXPECT_OK(NodeDefBuilder("FusedSafeEmbeddingLookupSparseGrad", + "FusedSafeEmbeddingLookupSparseGrad") + .Input(FakeInput(DT_FLOAT)) // gradients + .Input(FakeInput(DT_INT64)) // unique_id + .Input(FakeInput(DT_INT64)) // unique_indices + .Input(FakeInput(DT_INT64)) // dense_shape + .Input(FakeInput(DT_INT64)) // indices + .Attr("T", DT_FLOAT) + .Attr("Tinput", DT_INT64) + .Attr("Tindices", DT_INT64) + .Attr("Tdense_shape", DT_INT64) + .Attr("combiner", "mean") + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + const int unique_size = 3; + const int nnz = 9; + const int batch_size = 5; + const int emb_vector_dim = 4; + const int entries = 8; + + Tensor unique_id(DT_INT64, {unique_size}); + Tensor unique_indices(DT_INT64, {nnz}); + Tensor sp_indices(DT_INT64, {nnz, 2}); + Tensor sp_dense_shape(DT_INT64, {2}); + Tensor grad_variable(DT_FLOAT, {batch_size, emb_vector_dim}); + + test::FillValues( + &grad_variable, {0.0103125420, 0.018807490, -0.0106398590, -0.029409127, + 0.0054132286, 0.013920069, -0.0190976150, -0.023196392, + 0.0100601720, 0.015330995, -0.0055795530, -0.024889620, + 0.0108455080, 0.018832123, -0.0095151365, -0.029357582, + 0.0100478110, 0.018798435, -0.0112019650, -0.029439624}); + test::FillValues(&unique_id, {1, 0, 4}); + test::FillValues(&unique_indices, {0, 0, 1, 2, 0, 0, 0, 1, 0}); + test::FillValues( + &sp_indices, {0, 1, 0, 3, 0, 6, 1, 3, 1, 6, 3, 3, 3, 4, 4, 1, 4, 7}); + test::FillValues(&sp_dense_shape, {batch_size, entries}); + + AddInputFromArray(grad_variable.shape(), grad_variable.flat()); + 
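+  // With the "mean" combiner the expected gradient of a unique id is the sum,
+  // over its occurrences, of grad_variable[row] / feature_count[row]. Here
+  // sp_indices places the nine values in rows {0,0,0,1,1,3,3,4,4}, so the
+  // per-row feature counts are {3, 2, 0, 2, 2}; id 1 occurs in rows
+  // {0,0,1,3,3,4}, and (writing g_r for grad_variable row r) its expected
+  // gradient is 2/3*g0 + 1/2*g1 + g3 + 1/2*g4, which is the first row of
+  // output1_tensor_expected below.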
AddInputFromArray(unique_id.shape(), unique_id.flat()); + AddInputFromArray(unique_indices.shape(), + unique_indices.flat()); + AddInputFromArray(sp_indices.shape(), sp_indices.flat()); + AddInputFromArray(sp_dense_shape.shape(), + sp_dense_shape.flat()); + + TF_ASSERT_OK(RunOpKernel()); + + Tensor output1_tensor_expected(DT_FLOAT, {unique_size, emb_vector_dim}); + Tensor output2_tensor_expected(DT_INT64, {unique_size}); + test::FillValues( + &output1_tensor_expected, + {0.0254510570, 0.0477297000, -0.0317581670, -0.075281680, 0.0084614195, + 0.0156683810, -0.0091476020, -0.024522856, 0.0027066143, 0.0069600344, + -0.0095488075, -0.011598196}); + test::FillValues(&output2_tensor_expected, {1, 0, 4}); + float *output1_ex = (float *)output1_tensor_expected.tensor_data().data(); + int64 *output2_ex = (int64 *)output2_tensor_expected.tensor_data().data(); + + const Tensor &output1_tensor = *GetOutput(0); + const Tensor &output2_tensor = *GetOutput(1); + + float *output1 = (float *)output1_tensor.tensor_data().data(); + int64 *output2 = (int64 *)output2_tensor.tensor_data().data(); + + test::ExpectTensorNear(output1_tensor_expected, output1_tensor, 1e-8); + test::ExpectTensorEqual(output2_tensor_expected, output2_tensor); +} + +} // namespace +} // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_op.h b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_op.h new file mode 100644 index 00000000..b0b2b9b4 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_op.h @@ -0,0 +1,11 @@ +#ifndef TENSORFLOW_CORE_KERNELS_FUSED_EMBEDDING_embedding_EMBEDDING_LOOKUP_SPARSE_OP_H_ +#define TENSORFLOW_CORE_KERNELS_FUSED_EMBEDDING_embedding_EMBEDDING_LOOKUP_SPARSE_OP_H_ + +#include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace functor {} // namespace functor +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_FUSED_EMBEDDING_embedding_EMBEDDING_LOOKUP_SPARSE_OP_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_post_grad_op_test.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_post_grad_op_test.cc new file mode 100644 index 00000000..c19b02f1 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_post_grad_op_test.cc @@ -0,0 +1,394 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/conv_ops_gpu.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session.h" + +namespace tensorflow { +namespace { + +enum class Device { CPU, GPU }; + +class FusedSafeEmbeddingPostLookupGradOpTest : public OpsTestBase { + protected: + void MakeOpAndSetDevice(Device device, int num_partitions, DataType dtype, + const std::string& combiner, const float max_norm, + const int default_id) { + if (device == Device::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + + TF_EXPECT_OK(NodeDefBuilder("fused_safe_embedding_post_look_up_grad", + "FusedEmbeddingSparsePostLookUpGrad") + .Attr("T", dtype) + .Attr("num_partitions", num_partitions) + .Attr("partition_axis", 0) + .Attr("combiner", combiner) + .Attr("max_norm", max_norm) + .Attr("default_id", default_id) + .Input(FakeInput(dtype)) + .Input(FakeInput(dtype)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT32)) + .Input(FakeInput(DT_INT32)) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + } +}; + +TEST_F(FusedSafeEmbeddingPostLookupGradOpTest, + Partition2_Mean_MaxNorm100_Float) { + const int nnz = 10; + const int batch_size = 4; + const int emb_vector_dim = 8; + const int entries = 8; + + MakeOpAndSetDevice(Device::CPU, 2, DT_FLOAT, "mean", 100.0, -1); + + // top_grad + AddInputFromArray( + TensorShape({batch_size, emb_vector_dim}), + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, + 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0}); + + // emb_shards + AddInputFromArray( + TensorShape({6, emb_vector_dim}), + {8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 24.0, 25.0, 26.0, 27.0, + 28.0, 29.0, 30.0, 31.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 32.0, 33.0, 34.0, 35.0, + 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0}); + AddInputFromArray( + TensorShape({4, emb_vector_dim}), + {56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, + 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, + 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, + 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0}); + + // sp_values: 3, 1, 4, 5, 7, 3, 12, 12, 15, 4 + // partitioned_values: 1, 3, 3, 4, 4, 5 and 7, 12, 12, 15 + // partitioned_indices + AddInputFromArray(TensorShape({6, 2}), + {0, 5, 0, 1, 2, 1, 1, 2, 3, 6, 1, 1}); + AddInputFromArray(TensorShape({4, 2}), {1, 7, 2, 4, 2, 7, 3, 0}); + + // feature_nums + AddInputFromArray(TensorShape({batch_size}), {2, 3, 3, 2}); + + // row_empty_and_invalid_flags + AddInputFromArray(TensorShape({batch_size + nnz}), + {0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor grad_shards_1(allocator(), DT_FLOAT, + TensorShape({6, emb_vector_dim})); + test::FillValues( + &grad_shards_1, + {0.00000000, 
0.50000000, 1.00000000, 1.50000000, 2.00000000, + 2.50000000, 3.00000000, 3.50000000, 0.00000000, 0.50000000, + 1.00000000, 1.50000000, 2.00000000, 2.50000000, 3.00000000, + 3.50000000, 5.33333349, 5.66666651, 6.00000000, 6.33333349, + 6.66666651, 7.00000000, 7.33333349, 7.66666651, 2.65028572, + 2.98157120, 3.31285667, 3.64414287, 3.97542834, 4.30671406, + 4.63799953, 4.96928549, 11.92628479, 12.42321396, 12.92014217, + 13.41707039, 13.91399956, 14.41092777, 14.90785599, 15.40478516, + 2.16437674, 2.43492365, 2.70547056, 2.97601795, 3.24656487, + 3.51711202, 3.78765893, 4.05820608}); + test::ExpectTensorNear(grad_shards_1, *GetOutput(0), 1e-4); + } + + { + Tensor grad_shards_2(allocator(), DT_FLOAT, + TensorShape({4, emb_vector_dim})); + test::FillValues( + &grad_shards_2, + {1.58337951, 1.78130186, 1.97922409, 2.17714667, 2.37506914, 2.57299161, + 2.77091384, 2.96883631, 1.89459133, 2.01300311, 2.13141513, 2.24982715, + 2.36823893, 2.48665094, 2.60506320, 2.72347474, 1.89459133, 2.01300311, + 2.13141513, 2.24982715, 2.36823893, 2.48665094, 2.60506320, 2.72347474, + 3.43474555, 3.57786012, 3.72097445, 3.86408877, 4.00720310, 4.15031767, + 4.29343224, 4.43654633}); + test::ExpectTensorNear(grad_shards_2, *GetOutput(1), 1e-4); + } +} + +TEST_F(FusedSafeEmbeddingPostLookupGradOpTest, + Partition2_SUM_Float_No_Default) { + const int nnz = 3; + const int batch_size = 3; + const int emb_vector_dim = 4; + const int entries = 8; + + MakeOpAndSetDevice(Device::CPU, 2, DT_FLOAT, "sum", -1.0, -1); + + // top_grad + AddInputFromArray( + TensorShape({batch_size, emb_vector_dim}), + {1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0}); + + // emb_shards + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0}); + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0}); + + // partitioned_indices + AddInputFromArray(TensorShape({2, 2}), {0, 0, 0, 5}); + AddInputFromArray(TensorShape({2, 2}), {1, 4, 2, 0}); + + // feature_nums + AddInputFromArray(TensorShape({batch_size}), {2, 1, 1}); + + // row_empty_and_invalid_flags + AddInputFromArray(TensorShape({batch_size + nnz}), {0, 0, 1, 1, 1, 1}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor grad_shards_1(allocator(), DT_FLOAT, + TensorShape({2, emb_vector_dim})); + test::FillValues(&grad_shards_1, + {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}); + test::ExpectTensorNear(grad_shards_1, *GetOutput(0), 1e-4); + } + + { + Tensor grad_shards_2(allocator(), DT_FLOAT, + TensorShape({2, emb_vector_dim})); + test::FillValues(&grad_shards_2, + {2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0}); + test::ExpectTensorNear(grad_shards_2, *GetOutput(1), 1e-4); + } +} + +TEST_F(FusedSafeEmbeddingPostLookupGradOpTest, Partition2_SUM_Float_Default_0) { + const int nnz = 3; + const int batch_size = 3; + const int emb_vector_dim = 4; + const int entries = 8; + + MakeOpAndSetDevice(Device::CPU, 2, DT_FLOAT, "sum", -1.0, 0); + + // top_grad + AddInputFromArray( + TensorShape({batch_size, emb_vector_dim}), + {1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0}); + + // emb_shards + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0}); + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0}); + + // partitioned_indices + AddInputFromArray(TensorShape({2, 2}), {0, 0, 0, 5}); + AddInputFromArray(TensorShape({2, 2}), {1, 4, 2, 0}); + + // feature_nums + 
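+  // feature_nums is the number of valid features per batch row (the divisor
+  // used by the "mean" / "sqrtn" combiners). row_empty_and_invalid_flags below
+  // appears to pack batch_size per-row "filled as empty" flags followed by nnz
+  // per-value validity flags; only the leading batch_size entries are read by
+  // this grad kernel. Because default_id is 0 (>= 0) in this case, gradients
+  // for values in a row flagged as empty are zeroed, which is why the second
+  // row of grad_shards_2 is expected to be all zeros.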
AddInputFromArray(TensorShape({batch_size}), {2, 1, 1}); + + // row_empty_and_invalid_flags + AddInputFromArray(TensorShape({batch_size + nnz}), {0, 0, 1, 1, 1, 1}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor grad_shards_1(allocator(), DT_FLOAT, + TensorShape({2, emb_vector_dim})); + test::FillValues(&grad_shards_1, + {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}); + test::ExpectTensorNear(grad_shards_1, *GetOutput(0), 1e-4); + } + + { + Tensor grad_shards_2(allocator(), DT_FLOAT, + TensorShape({2, emb_vector_dim})); + test::FillValues(&grad_shards_2, + {2.0, 2.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0}); + test::ExpectTensorNear(grad_shards_2, *GetOutput(1), 1e-4); + } +} + +//----------------------------------------------------------------------------// +// Performance benchmarks // +//----------------------------------------------------------------------------// + +template +void FillValues(Tensor* tensor, gtl::ArraySlice vals) { + auto flat = tensor->flat(); + CHECK_EQ(flat.size(), vals.size()); + if (flat.size() > 0) { + std::copy_n(vals.data(), vals.size(), flat.data()); + } +} + +template +void FillValues(Tensor* tensor, int val) { + auto flat = tensor->flat(); + for (int i = 0; i < flat.size(); ++i) { + flat.data()[i] = val; + } +} + +template +void FillZerosValues(Tensor* tensor) { + auto flat = tensor->flat(); + for (int i = 0; i < flat.size(); ++i) { + flat.data()[i] = 0.0; + } +} + +template +void FillOnesValues(Tensor* tensor) { + auto flat = tensor->flat(); + float scale = std::rand() / ((RAND_MAX + 1u) / 6); + for (int i = 0; i < flat.size(); ++i) { + flat.data()[i] = 1.1 * scale; + } +} + +template +void FillIndiceValues(Tensor* tensor, const int partitions, + const int batch_size, const int entries) { + auto flat = tensor->flat(); + int k = 0; + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < entries; ++j) { + flat.data()[k] = i + partitions; + flat.data()[k + 1] = j; + k += 2; + } + } +} + +template +void PrintValues(Tensor* tensor) { + auto flat = tensor->flat(); + for (int i = 0; i < flat.size(); ++i) { + std::cout << flat.data()[i] << ", "; + } + std::cout << std::endl; +} + +template +static Graph* EmbPostGradOp(const string& kind, int num_partitions, + const std::string& combiner, const float max_norm, + const int default_id) { + const int nnz = 3; + const int batch_size = 512; + const int emb_vector_dim = 32; + const int entries = 8; + const float sparsity = 0.5; + const int total_inputs = batch_size * entries * sparsity; + + Graph* g = new Graph(OpRegistry::Global()); + DataType type = DataTypeToEnum::v(); + + string op_name = "FusedEmbeddingSparsePostLookUpGrad"; + + // top_grad + Tensor top_grad(type, TensorShape({batch_size, emb_vector_dim})); + FillOnesValues(&top_grad); + + // emb_shards + std::vector input_emb_shards; + input_emb_shards.reserve(num_partitions); + for (int i = 0; i < num_partitions; ++i) { + Tensor emb_shards( + type, TensorShape({total_inputs / num_partitions, emb_vector_dim})); + FillOnesValues(&emb_shards); + input_emb_shards.push_back(test::graph::Constant(g, emb_shards)); + // PrintValues(&emb_shards); + } + + // partitioned_indices + std::vector partitioned_indices; + partitioned_indices.reserve(num_partitions); + for (int i = 0; i < num_partitions; ++i) { + Tensor sub_partitioned_indice( + DT_INT64, TensorShape({total_inputs / num_partitions, 2})); + FillIndiceValues(&sub_partitioned_indice, i, + batch_size / num_partitions, entries * sparsity); + partitioned_indices.push_back( + 
test::graph::Constant(g, sub_partitioned_indice)); + // PrintValues(&sub_partitioned_indice); + } + + // sp_dense_shape + Tensor feature_nums(DT_INT32, TensorShape({batch_size})); + FillValues(&feature_nums, entries * sparsity); + + // row_empty_and_invalid_flags + Tensor row_empty_and_invalid_flags(DT_INT32, TensorShape({batch_size + nnz})); + FillZerosValues(&row_empty_and_invalid_flags); + + auto nodeBuilder = + NodeBuilder(g->NewName("n"), op_name) + .Attr("T", type) + .Attr("num_partitions", num_partitions) + .Attr("partition_axis", 0) + .Attr("combiner", combiner) + .Attr("max_norm", max_norm) + .Attr("default_id", default_id) + .Input(test::graph::Constant(g, top_grad)) + .Input(input_emb_shards) + .Input(partitioned_indices) + .Input(test::graph::Constant(g, feature_nums)) + .Input(test::graph::Constant(g, row_empty_and_invalid_flags)); + TF_CHECK_OK(nodeBuilder.Finalize(g, nullptr)); + return g; +} + +#define BM_EMB_POST_OP(kind, NP, C, T, DEVICE, NTH) \ + static void BM_EMB_POST_OP##_##kind##_##NP##_##C##_##T##_##DEVICE##_##NTH( \ + int iters) { \ + testing::UseRealTime(); \ + SessionOptions opts; \ + opts.config.set_intra_op_parallelism_threads(NTH); \ + test::Benchmark(#DEVICE, EmbPostGradOp(#kind, NP, #C, -1.0, -1), &opts) \ + .Run(iters); \ + } \ + BENCHMARK(BM_EMB_POST_OP##_##kind##_##NP##_##C##_##T##_##DEVICE##_##NTH); + +#define BM_EMB_POST_OP_kind(NP, C, NTH) \ + BM_EMB_POST_OP(OPT, NP, C, float, CPU, NTH); + +#define BM_EMB_POST_OP_NTH(NP, C) \ + BM_EMB_POST_OP_kind(NP, C, 1); \ + BM_EMB_POST_OP_kind(NP, C, 4); \ + BM_EMB_POST_OP_kind(NP, C, 8); + +BM_EMB_POST_OP_NTH(2, sum); + +} // namespace +} // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_post_op.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_post_op.cc new file mode 100644 index 00000000..5b86b0e1 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_post_op.cc @@ -0,0 +1,466 @@ +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/resource_var.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +enum SparseSegmentReductionOperation { kSum, kMean, kSqrtN }; + +namespace { +inline int64 partitioned_indices( + std::vector>& indices, int indice_dim, + int64 id) { + int indices_num = indices.size(); + int64 rows = 0; + for (int i = 0; i < indices_num; ++i) { + size_t sub_nnz = std::get<0>(indices[i]); + rows += sub_nnz; + if (rows > id) { + int idx = id - (rows - sub_nnz); + return std::get<1>(indices[i])[idx * indice_dim]; + } + } +} + +inline const float* const partitioned_embedding_tables( + std::vector>& embedding_tables, + int embedding_size, int64 id) { + int tables_num = embedding_tables.size(); + int64 rows = 0; + for (int i = 0; i < tables_num; ++i) { + size_t sub_nnz = std::get<0>(embedding_tables[i]); + rows += sub_nnz; + if (rows > id) { + int idx = id - (rows - sub_nnz); + return &(std::get<1>(embedding_tables[i])[idx * embedding_size]); + } + } +} + +static void sparse_partitioned_gather( + int64 input_size, std::vector>& indices, + int indice_dim, int rows, + std::vector>& embedding_tables, + float* output, const int64_t embedding_size, 
+ SparseSegmentReductionOperation operation, const bool set_empty_row_zero, + const int* empty_row) { + // Record how many values in each row + uint64_t* row_values = new uint64_t[rows]; + memset(row_values, 0, rows * sizeof(uint64_t)); + // output_buffer is output buffer + float* output_buffer = new float[rows * embedding_size]; + memset(output_buffer, 0, rows * embedding_size * sizeof(float)); + +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + auto avx512_add = [](const float* input, uint64_t input_idx, float* output, + uint64_t output_idx, const int64_t num) { + constexpr size_t float_displacement = 4; + constexpr size_t float_alignment = 16; + int64_t quotient = num >> float_displacement; + int64_t remainder = num & 0x000F; + + for (int64_t j = 0; j < quotient; ++j) { + int64_t offset = j << float_displacement; + __m512 a = _mm512_loadu_ps(&input[input_idx + offset]); + __m512 b = _mm512_loadu_ps(&output[output_idx + offset]); + a = _mm512_add_ps(a, b); + _mm512_storeu_ps(&output[output_idx + offset], a); + } + + if (remainder != 0) { + __mmask16 mask = 0xffff >> (float_alignment - remainder); + int64_t offset = quotient << float_displacement; + __m512 zero = _mm512_setzero_ps(); + __m512 a = _mm512_mask_loadu_ps(zero, mask, &input[input_idx + offset]); + __m512 b = _mm512_mask_loadu_ps(zero, mask, &output[output_idx + offset]); + a = _mm512_mask_add_ps(zero, mask, a, b); + _mm512_mask_storeu_ps(&output[output_idx + offset], mask, a); + } + }; + + auto avx512_mean = [](const float* input, uint64_t input_idx, + const float* sum, float* output, uint64_t output_idx, + const int64_t num) { + constexpr size_t float_displacement = 4; + constexpr size_t float_alignment = 16; + int64_t quotient = num >> float_displacement; + int64_t remainder = num & 0x000F; + __m512 sum_ = _mm512_broadcastss_ps(_mm_load_ss(sum)); + + for (int64_t j = 0; j < quotient; ++j) { + int64_t offset = j << float_displacement; + __m512 a = _mm512_loadu_ps(&input[input_idx + offset]); + __m512 b = _mm512_loadu_ps(&output[output_idx + offset]); + a = _mm512_add_ps(a, b); + a = _mm512_mul_ps(a, sum_); + _mm512_storeu_ps(&output[output_idx + offset], a); + } + + if (remainder != 0) { + __mmask16 mask = 0xffff >> (float_alignment - remainder); + int64_t offset = quotient << float_displacement; + __m512 zero = _mm512_setzero_ps(); + __m512 a = _mm512_mask_loadu_ps(zero, mask, &input[input_idx + offset]); + __m512 b = _mm512_mask_loadu_ps(zero, mask, &output[output_idx + offset]); + a = _mm512_mask_add_ps(zero, mask, a, b); + a = _mm512_mask_mul_ps(zero, mask, a, sum_); + _mm512_mask_storeu_ps(&output[output_idx + offset], mask, a); + } + }; +#endif + + for (int64_t i = input_size - 1; i >= 0; --i) { + // From sub_indices to find output row + auto row = partitioned_indices(indices, indice_dim, i); + row_values[row] += 1; + // From sub_embedding_tables to find embedding_table row ptr + auto embedding_row = + partitioned_embedding_tables(embedding_tables, embedding_size, i); + // add output_buffer to do block addition + uint64_t output_row = row * embedding_size; + +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + avx512_add(embedding_row, 0, output_buffer, output_row, embedding_size); +#else + for (int64_t j = 0; j < embedding_size; ++j) { + output_buffer[output_row + j] += embedding_row[j]; + } +#endif + + if (row_values[row] == 8) { + memcpy(&output[output_row], &output_buffer[output_row], + embedding_size * sizeof(float)); + memset(&output_buffer[output_row], 0, embedding_size * sizeof(float)); + } else 
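+    // Blocked flush: output_buffer holds the running partial sum for each
+    // output row, and once 8 values have been gathered for a row the partial
+    // sum is moved into `output` (the first flush above copies, later flushes
+    // below add) and the buffer slot is cleared. Whatever is left in the
+    // buffer after this loop is folded in by the finalization pass further
+    // down, which also applies the mean / sqrtn scaling and zeroes rows
+    // flagged as empty.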
if (row_values[row] % 8 == 0) { +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + avx512_add(output_buffer, output_row, output, output_row, embedding_size); +#else + for (int64_t j = 0; j < embedding_size; ++j) { + output[output_row + j] += output_buffer[output_row + j]; + } +#endif + memset(&output_buffer[output_row], 0, embedding_size * sizeof(float)); + } + } + + for (int64_t i = 0; i < rows; ++i) { + int64_t output_row = i * embedding_size; + // zero emtpy rows + if (set_empty_row_zero && empty_row[i] == 1) { + memset(&output[output_row], 0, embedding_size * sizeof(float)); + } else { +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + if (operation == SparseSegmentReductionOperation::kSum) { + if (row_values[i] < 8) { + memcpy(&output[output_row], &output_buffer[output_row], + embedding_size * sizeof(float)); + } else { + avx512_add(output_buffer, output_row, output, output_row, + embedding_size); + } + } else if (operation == SparseSegmentReductionOperation::kMean) { + float sum = 1.0 / static_cast(row_values[i]); + avx512_mean(output_buffer, output_row, &sum, output, output_row, + embedding_size); + } else if (operation == SparseSegmentReductionOperation::kSqrtN) { + float sqrt = 1.0 / std::sqrt(row_values[i]); + avx512_mean(output_buffer, output_row, &sqrt, output, output_row, + embedding_size); + } +#else + if (operation == SparseSegmentReductionOperation::kSum) { + for (int64_t j = 0; j < embedding_size; ++j) { + output[output_row + j] += output_buffer[output_row + j]; + } + } else if (operation == SparseSegmentReductionOperation::kMean) { + for (int64_t j = 0; j < embedding_size; ++j) { + output[output_row + j] += output_buffer[output_row + j]; + output[output_row + j] /= row_values[i]; + } + } else if (operation == SparseSegmentReductionOperation::kSqrtN) { + for (int64_t j = 0; j < embedding_size; ++j) { + output[output_row + j] += output_buffer[output_row + j]; + output[output_row + j] /= std::sqrt(row_values[i]); + } + } +#endif + } + } + + delete[] row_values; + delete[] output_buffer; +} + +static inline void set_feature_nums( + int32* feature_nums, int64 input_size, + std::vector> indices, int indice_dim) { + for (int64 i = 0; i < input_size; ++i) { + feature_nums[partitioned_indices(indices, indice_dim, i)]++; + } +} +} // namespace + +template +class FusedSafeEmbeddingPostLookupOp : public OpKernel { + public: + explicit FusedSafeEmbeddingPostLookupOp(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("num_partitions", &num_partitions_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("partition_axis", &partition_axis_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("combiner", &combiner_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_norm", &max_norm_)); + int temp_default_id; + OP_REQUIRES_OK(ctx, ctx->GetAttr("default_id", &temp_default_id)); + default_id_ = int64_t(temp_default_id); + if (combiner_ == "sum") { + operation_ = SparseSegmentReductionOperation::kSum; + } else if (combiner_ == "mean") { + operation_ = SparseSegmentReductionOperation::kMean; + } else if (combiner_ == "sqrtn") { + operation_ = SparseSegmentReductionOperation::kSqrtN; + } else { + OP_REQUIRES( + ctx, false, + errors::InvalidArgument( + "Currently, 'mean', 'sqrtn' and 'sum' are only supported")); + } + } + + ~FusedSafeEmbeddingPostLookupOp() {} + + void Compute(OpKernelContext* ctx) override { + OpInputList emb_shards; + OP_REQUIRES_OK(ctx, ctx->input_list("emb_shards", &emb_shards)); + + OpInputList partitioned_indices; + OP_REQUIRES_OK( + ctx, 
ctx->input_list("partitioned_indices", &partitioned_indices));
+
+    Tensor const* dense_shape_tensor = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->input("sp_dense_shape", &dense_shape_tensor));
+
+    Tensor const* row_empty_and_invalid_flags = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->input("row_empty_and_invalid_flags",
+                                   &row_empty_and_invalid_flags));
+
+    const int64_t embedding_size = emb_shards[0].shape().dim_size(1);
+
+    int input_dims = dense_shape_tensor->dim_size(0);
+    int batch_size = 1;
+    for (int i = 0; i < input_dims - 1; ++i) {
+      batch_size *= dense_shape_tensor->flat<int64>().data()[i];
+    }
+
+    // Check the input
+    OP_REQUIRES(
+        ctx, (dense_shape_tensor->dims() == 1),
+        errors::InvalidArgument("Shape tensor is not valid (dims != 1)"));
+    OP_REQUIRES(
+        ctx, (dense_shape_tensor->dim_size(0) >= 2),
+        errors::InvalidArgument("Shape tensor is not valid (dim_size(0) < 2)"));
+
+    const int* empty_row = row_empty_and_invalid_flags->flat<int32>().data();
+
+    Tensor* emb_vectors_tensor = nullptr;
+    OP_REQUIRES_OK(
+        ctx, ctx->allocate_output(0, TensorShape({batch_size, embedding_size}),
+                                  &emb_vectors_tensor));
+    float* output = (float*)emb_vectors_tensor->tensor_data().data();
+    memset(output, 0, batch_size * embedding_size * sizeof(float));
+
+    Tensor* feature_nums_tensor;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(1, TensorShape({batch_size}),
+                                             &feature_nums_tensor));
+    int32* feature_nums = (int32*)feature_nums_tensor->tensor_data().data();
+    memset(feature_nums, 0, batch_size * sizeof(int32));
+
+    int64 input_size = 0;
+    for (int i = 0; i < num_partitions_; ++i) {
+      input_size += partitioned_indices[i].shape().dim_size(0);
+    }
+
+    int indice_dim = partitioned_indices[0].shape().dim_size(1);
+
+    const bool set_empty_row_zero = default_id_ >= 0;
+
+    std::vector<std::tuple<size_t, const float*>> embedding_tables;
+    std::vector<std::tuple<size_t, const int64*>> indices;
+    embedding_tables.reserve(num_partitions_);
+    indices.reserve(num_partitions_);
+    for (int i = 0; i < num_partitions_; i++) {
+      const size_t sub_nnz = emb_shards[i].shape().dim_size(0);
+      OP_REQUIRES(
+          ctx, sub_nnz == partitioned_indices[i].shape().dim_size(0),
+          errors::InvalidArgument(
+              "emb_shard and partitioned_indice don't have the same length"));
+      embedding_tables.emplace_back(
+          std::make_tuple(sub_nnz, emb_shards[i].flat<float>().data()));
+      indices.emplace_back(std::make_tuple(
+          sub_nnz, partitioned_indices[i].flat<int64>().data()));
+    }
+
+    sparse_partitioned_gather(input_size, indices, indice_dim, batch_size,
+                              embedding_tables, output, embedding_size,
+                              operation_, set_empty_row_zero, empty_row);
+    set_feature_nums(feature_nums, input_size, indices, indice_dim);
+  }
+
+ private:
+  int num_partitions_;
+  int partition_axis_;
+  std::string combiner_;
+  float max_norm_;
+  int64_t default_id_;
+  SparseSegmentReductionOperation operation_;
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("FusedEmbeddingSparsePostLookUp").Device(DEVICE_CPU),
+    FusedSafeEmbeddingPostLookupOp<CPUDevice>);
+
+template <typename Device>
+class FusedSafeEmbeddingPostLookupGradOp : public OpKernel {
+ public:
+  explicit FusedSafeEmbeddingPostLookupGradOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("num_partitions", &num_partitions_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("partition_axis", &partition_axis_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("combiner", &combiner_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("max_norm", &max_norm_));
+    int temp_default_id;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("default_id", &temp_default_id));
+    default_id_ = int64_t(temp_default_id);
+    if (combiner_ == "sum") {
+      operation_ =
SparseSegmentReductionOperation::kSum; + } else if (combiner_ == "mean") { + operation_ = SparseSegmentReductionOperation::kMean; + } else if (combiner_ == "sqrtn") { + operation_ = SparseSegmentReductionOperation::kSqrtN; + } else { + OP_REQUIRES( + ctx, false, + errors::InvalidArgument( + "Currently, 'mean', 'sqrtn' and 'sum' are only supported")); + } + } + + void Compute(OpKernelContext* ctx) override { + Tensor const* top_grad_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("top_grad", &top_grad_tensor)); + + OpInputList emb_shards; + OP_REQUIRES_OK(ctx, ctx->input_list("emb_shards", &emb_shards)); + + OpInputList partitioned_indices; + OP_REQUIRES_OK( + ctx, ctx->input_list("partitioned_indices", &partitioned_indices)); + + Tensor const* feature_nums = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("feature_nums", &feature_nums)); + + Tensor const* row_empty_and_invalid_flags = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("row_empty_and_invalid_flags", + &row_empty_and_invalid_flags)); + + OpOutputList grad_shards; + OP_REQUIRES_OK(ctx, ctx->output_list("grad_shards", &grad_shards)); + + const float* top_grad = top_grad_tensor->flat().data(); + const int64_t batch_size = top_grad_tensor->shape().dim_size(0); + const int64_t emb_vec_size = emb_shards[0].shape().dim_size(1); + const int* f_nums = feature_nums->flat().data(); + const int* empty_row = row_empty_and_invalid_flags->flat().data(); + + const bool set_empty_row_zero = default_id_ >= 0; + + for (int i = 0; i < num_partitions_; i++) { + const int64_t sub_nnz = partitioned_indices[i].shape().dim_size(0); + const int64_t indices_col = partitioned_indices[i].shape().dim_size(1); + const int64* indices = partitioned_indices[i].flat().data(); + Tensor* grad_shard; + OP_REQUIRES_OK( + ctx, grad_shards.allocate(i, TensorShape({sub_nnz, emb_vec_size}), + &grad_shard)); + float* grad = grad_shard->flat().data(); + + std::vector l2_norm(sub_nnz, 1.0); + if (max_norm_ > 0.0) { + const float* emb = emb_shards[i].flat().data(); + for (int j = 0; j < sub_nnz; ++j) { + float sum = 0.0; + for (int k = 0; k < emb_vec_size; ++k) { + sum += emb[j * emb_vec_size + k] * emb[j * emb_vec_size + k]; + } + l2_norm[j] = std::sqrt(sum); + } + } + + if (operation_ == SparseSegmentReductionOperation::kSum) { + for (int j = 0; j < sub_nnz; ++j) { + int64 idx = indices[j * indices_col]; + if (set_empty_row_zero == true && empty_row[idx] == 1) + memset(&grad[j * emb_vec_size], 0, sizeof(float) * emb_vec_size); + else + memcpy(&grad[j * emb_vec_size], &top_grad[idx * emb_vec_size], + sizeof(float) * emb_vec_size); + } + } else if (operation_ == SparseSegmentReductionOperation::kMean) { + for (int j = 0; j < sub_nnz; ++j) { + int64 idx = indices[j * indices_col]; + if (set_empty_row_zero == true && empty_row[idx] == 1) + memset(&grad[j * emb_vec_size], 0, sizeof(float) * emb_vec_size); + else { + for (int k = 0; k < emb_vec_size; ++k) { + grad[j * emb_vec_size + k] = + top_grad[idx * emb_vec_size + k] / f_nums[idx]; + if (max_norm_ > 0.0 && l2_norm[j] > max_norm_) { + grad[j * emb_vec_size + k] *= max_norm_ / l2_norm[j]; + } + } + } + } + } else if (operation_ == SparseSegmentReductionOperation::kSqrtN) { + for (int j = 0; j < sub_nnz; ++j) { + int64 idx = indices[j * indices_col]; + if (set_empty_row_zero == true && empty_row[idx] == 1) + memset(&grad[j * emb_vec_size], 0, sizeof(float) * emb_vec_size); + else { + for (int k = 0; k < emb_vec_size; ++k) { + grad[j * emb_vec_size + k] = + top_grad[idx * emb_vec_size + k] / std::sqrt(f_nums[idx]); + if (max_norm_ > 
0.0 && l2_norm[j] > max_norm_) { + grad[j * emb_vec_size + k] *= max_norm_ / l2_norm[j]; + } + } + } + } + } else { + OP_REQUIRES( + ctx, false, + errors::InvalidArgument( + "Currently, 'mean', 'sqrtn' and 'sum' are only supported")); + } + } + } + + private: + int num_partitions_; + int partition_axis_; + std::string combiner_; + float max_norm_; + int64_t default_id_; + SparseSegmentReductionOperation operation_; +}; + +REGISTER_KERNEL_BUILDER( + Name("FusedEmbeddingSparsePostLookUpGrad").Device(DEVICE_CPU), + FusedSafeEmbeddingPostLookupGradOp); + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_post_op_test.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_post_op_test.cc new file mode 100644 index 00000000..c9e38da0 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_post_op_test.cc @@ -0,0 +1,419 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/conv_ops_gpu.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session.h" + +namespace tensorflow { +namespace { + +enum class Device { CPU, GPU }; +class FusedSafeEmbeddingPostLookupOpTest : public OpsTestBase { + protected: + void MakeOpAndSetDevice(Device device, int num_partitions, DataType dtype, + const std::string& combiner, const float max_norm, + const int default_id) { + if (device == Device::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + + TF_EXPECT_OK(NodeDefBuilder("fused_safe_embedding_post_look_up", + "FusedEmbeddingSparsePostLookUp") + .Attr("T", dtype) + .Attr("num_partitions", num_partitions) + .Attr("partition_axis", 0) + .Attr("combiner", combiner) + .Attr("max_norm", max_norm) + .Attr("default_id", default_id) + .Input(FakeInput(num_partitions, dtype)) + .Input(FakeInput(num_partitions, DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT32)) + .Input(FakeInput(DT_INT64)) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + } +}; + +// TEST_F(FusedSafeEmbeddingPostLookupOpTest, +// Partition3_Sqrtn_MaxNorm200_Float) { +// const int nnz = 10; +// const int batch_size = 4; +// const int emb_vector_dim = 8; +// const int entries = 8; + +// MakeOpAndSetDevice(Device::CPU, 3, DT_FLOAT, "sqrtn", 200.0, -1); + +// // emb_shards +// 
AddInputFromArray( +// TensorShape({6, emb_vector_dim}), +// { +// 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 24.0, 25.0, +// 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 24.0, 25.0, 26.0, 27.0, +// 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, +// 38.0, 39.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, +// 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, +// }); +// AddInputFromArray(TensorShape({1, emb_vector_dim}), +// {56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0}); +// AddInputFromArray( +// TensorShape({3, emb_vector_dim}), +// {96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, +// 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, +// 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0}); + +// // partitioned_indices +// AddInputFromArray(TensorShape({6, 2}), +// {0, 5, 0, 1, 2, 1, 1, 2, 3, 6, 1, 1}); +// AddInputFromArray(TensorShape({1, 2}), {1, 7}); +// AddInputFromArray(TensorShape({3, 2}), {2, 4, 2, 7, 3, 0}); + +// // sp_dense_shape +// AddInputFromArray(TensorShape({2}), {batch_size, entries}); + +// // row_empty_and_invalid_flags +// AddInputFromArray(TensorShape({batch_size + nnz}), +// {0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + +// TF_ASSERT_OK(RunOpKernel()); +// TF_EXPECT_OK(device_->Sync()); + +// { +// Tensor expected_emb_vectors(allocator(), DT_FLOAT, +// TensorShape({batch_size, emb_vector_dim})); +// test::FillValues( +// &expected_emb_vectors, +// {22.62741661, 24.04163170, 25.45584488, 26.87005806, 28.28427124, +// 29.69848442, 31.11269951, 32.52691269, 73.90083313, 75.63288879, +// 77.36493683, 79.09698486, 80.82904053, 82.56108856, 84.29314423, +// 86.02519226, 92.61308289, 94.01081848, 95.40855408, 96.80628204, +// 98.20401764, 99.60175323, 100.99948120, 102.39721680, 71.20205688, +// 72.31395721, 73.42584991, 74.53774261, 75.64963531, 76.76153564, +// 77.87342834, 78.98532867}); +// test::ExpectTensorNear(expected_emb_vectors, *GetOutput(0), 1e-4); +// } +// { +// Tensor feature_nums_expected(allocator(), DT_INT32, +// TensorShape({batch_size})); +// test::FillValues(&feature_nums_expected, {2, 3, 3, 2}); +// test::ExpectTensorEqual(feature_nums_expected, *GetOutput(1)); +// } +// } + +TEST_F(FusedSafeEmbeddingPostLookupOpTest, Partition3_Sqrtn_Float) { + const int nnz = 10; + const int batch_size = 4; + const int emb_vector_dim = 8; + const int entries = 8; + + MakeOpAndSetDevice(Device::CPU, 3, DT_FLOAT, "sqrtn", -1.0, -1); + + // emb_shards + AddInputFromArray( + TensorShape({6, emb_vector_dim}), + { + 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 24.0, 25.0, + 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 24.0, 25.0, 26.0, 27.0, + 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, + 38.0, 39.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, + 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, + }); + AddInputFromArray(TensorShape({1, emb_vector_dim}), + {56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0}); + AddInputFromArray( + TensorShape({3, emb_vector_dim}), + {96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, + 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, + 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0}); + + // partitioned_indices + AddInputFromArray(TensorShape({6, 2}), + {0, 5, 0, 1, 2, 1, 1, 2, 3, 6, 1, 1}); + AddInputFromArray(TensorShape({1, 2}), {1, 7}); + AddInputFromArray(TensorShape({3, 2}), {2, 4, 2, 7, 3, 0}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {batch_size, entries}); + + // row_empty_and_invalid_flags + AddInputFromArray(TensorShape({batch_size + nnz}), + {0, 0, 0, 0, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_emb_vectors(allocator(), DT_FLOAT, + TensorShape({batch_size, emb_vector_dim})); + test::FillValues( + &expected_emb_vectors, + {22.62741661, 24.04162979, 25.45584297, 26.87005806, 28.28427124, + 29.69848442, 31.11269760, 32.52691269, 73.90083313, 75.63288116, + 77.36493683, 79.09698486, 80.82903290, 82.56108856, 84.29313660, + 86.02519226, 124.70765686, 126.43970490, 128.17175293, 129.90380859, + 131.63586426, 133.36790466, 135.09996033, 136.83201599, 107.48023224, + 108.89443970, 110.30865479, 111.72286987, 113.13708496, 114.55130005, + 115.96550751, 117.37972260}); + test::ExpectTensorNear(expected_emb_vectors, *GetOutput(0), 1e-4); + } + { + Tensor feature_nums_expected(allocator(), DT_INT32, + TensorShape({batch_size})); + test::FillValues(&feature_nums_expected, {2, 3, 3, 2}); + test::ExpectTensorEqual(feature_nums_expected, *GetOutput(1)); + } +} + +TEST_F(FusedSafeEmbeddingPostLookupOpTest, Partition2_Sum_No_Default) { + const int nnz = 3; + const int batch_size = 3; + const int emb_vector_dim = 4; + const int entries = 8; + + MakeOpAndSetDevice(Device::CPU, 2, DT_FLOAT, "sum", -1.0, -1); + + // emb_shards + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0}); + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {10.0, 10.0, 10.0, 10.0, 13.0, 13.0, 13.0, 13.0}); + + // partitioned_indices + AddInputFromArray(TensorShape({2, 2}), {0, 0, 0, 5}); + AddInputFromArray(TensorShape({2, 2}), {1, 4, 2, 0}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {batch_size, entries}); + + // row_empty_and_invalid_flags + AddInputFromArray(TensorShape({batch_size + nnz}), {0, 0, 1, 1, 1, 1}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_emb_vectors(allocator(), DT_FLOAT, + TensorShape({batch_size, emb_vector_dim})); + test::FillValues( + &expected_emb_vectors, + {3.0, 3.0, 3.0, 3.0, 10.0, 10.0, 10.0, 10.0, 13.0, 13.0, 13.0, 13.0}); + test::ExpectTensorNear(expected_emb_vectors, *GetOutput(0), 1e-4); + } + { + Tensor feature_nums_expected(allocator(), DT_INT32, + TensorShape({batch_size})); + test::FillValues(&feature_nums_expected, {2, 1, 1}); + test::ExpectTensorEqual(feature_nums_expected, *GetOutput(1)); + } +} + +TEST_F(FusedSafeEmbeddingPostLookupOpTest, Partition2_Sum_Default_0) { + const int nnz = 3; + const int batch_size = 3; + const int emb_vector_dim = 4; + const int entries = 8; + + MakeOpAndSetDevice(Device::CPU, 2, DT_FLOAT, "sum", -1.0, 0); + + // emb_shards + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0}); + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {10.0, 10.0, 10.0, 10.0, 13.0, 13.0, 13.0, 13.0}); + + // partitioned_indices + AddInputFromArray(TensorShape({2, 2}), {0, 0, 0, 5}); + AddInputFromArray(TensorShape({2, 2}), {1, 4, 2, 0}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {batch_size, entries}); + + // row_empty_and_invalid_flags + AddInputFromArray(TensorShape({batch_size + nnz}), {0, 0, 1, 1, 1, 1}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_emb_vectors(allocator(), DT_FLOAT, + TensorShape({batch_size, emb_vector_dim})); + test::FillValues( + &expected_emb_vectors, + {3.0, 3.0, 3.0, 3.0, 10.0, 10.0, 10.0, 10.0, 0.0, 0.0, 0.0, 0.0}); + test::ExpectTensorNear(expected_emb_vectors, *GetOutput(0), 1e-4); + } + { + Tensor 
feature_nums_expected(allocator(), DT_INT32, + TensorShape({batch_size})); + test::FillValues(&feature_nums_expected, {2, 1, 1}); + test::ExpectTensorEqual(feature_nums_expected, *GetOutput(1)); + } +} + +//----------------------------------------------------------------------------// +// Performance benchmarks // +//----------------------------------------------------------------------------// + +template +void FillValues(Tensor* tensor, gtl::ArraySlice vals) { + auto flat = tensor->flat(); + CHECK_EQ(flat.size(), vals.size()); + if (flat.size() > 0) { + std::copy_n(vals.data(), vals.size(), flat.data()); + } +} + +template +void FillZerosValues(Tensor* tensor) { + auto flat = tensor->flat(); + for (int i = 0; i < flat.size(); ++i) { + flat.data()[i] = 0.0; + } +} + +template +void FillOnesValues(Tensor* tensor) { + auto flat = tensor->flat(); + float scale = std::rand() / ((RAND_MAX + 1u) / 6); + for (int i = 0; i < flat.size(); ++i) { + flat.data()[i] = 1.1 * scale; + } +} + +template +void FillIndiceValues(Tensor* tensor, const int partitions, + const int batch_size, const int entries) { + auto flat = tensor->flat(); + int k = 0; + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < entries; ++j) { + flat.data()[k] = i + partitions; + flat.data()[k + 1] = j; + k += 2; + } + } +} + +template +void PrintValues(Tensor* tensor) { + auto flat = tensor->flat(); + for (int i = 0; i < flat.size(); ++i) { + std::cout << flat.data()[i] << ", "; + } + std::cout << std::endl; +} + +template +static Graph* EmbPostOp(const string& kind, int num_partitions, + const std::string& combiner, const float max_norm, + const int default_id) { + const int nnz = 3; + const int batch_size = 512; + const int emb_vector_dim = 32; + const int entries = 8; + const float sparsity = 0.5; + const int total_inputs = batch_size * entries * sparsity; + + Graph* g = new Graph(OpRegistry::Global()); + DataType type = DataTypeToEnum::v(); + + const bool isDefault = (kind == "Default"); + string op_name = isDefault ? 
"FusedEmbeddingSparsePostLookUpOrigin" + : "FusedEmbeddingSparsePostLookUp"; + + // emb_shards + std::vector input_emb_shards; + input_emb_shards.reserve(num_partitions); + for (int i = 0; i < num_partitions; ++i) { + Tensor emb_shards( + type, TensorShape({total_inputs / num_partitions, emb_vector_dim})); + FillOnesValues(&emb_shards); + input_emb_shards.push_back(test::graph::Constant(g, emb_shards)); + // PrintValues(&emb_shards); + } + + // partitioned_indices + std::vector partitioned_indices; + partitioned_indices.reserve(num_partitions); + for (int i = 0; i < num_partitions; ++i) { + Tensor sub_partitioned_indice( + DT_INT64, TensorShape({total_inputs / num_partitions, 2})); + FillIndiceValues(&sub_partitioned_indice, i, + batch_size / num_partitions, entries * sparsity); + partitioned_indices.push_back( + test::graph::Constant(g, sub_partitioned_indice)); + // PrintValues(&sub_partitioned_indice); + } + + // sp_dense_shape + Tensor sp_dense_shape(DT_INT64, TensorShape({2})); + FillValues(&sp_dense_shape, {batch_size, entries}); + + // row_empty_and_invalid_flags + Tensor row_empty_and_invalid_flags(DT_INT32, TensorShape({batch_size + nnz})); + FillZerosValues(&row_empty_and_invalid_flags); + + auto nodeBuilder = + NodeBuilder(g->NewName("n"), op_name) + .Attr("T", type) + .Attr("num_partitions", num_partitions) + .Attr("partition_axis", 0) + .Attr("combiner", combiner) + .Attr("max_norm", max_norm) + .Attr("default_id", default_id) + .Input(input_emb_shards) + .Input(partitioned_indices) + .Input(test::graph::Constant(g, sp_dense_shape)) + .Input(test::graph::Constant(g, row_empty_and_invalid_flags)) + .Input(partitioned_indices); + TF_CHECK_OK(nodeBuilder.Finalize(g, nullptr)); + return g; +} + +#define BM_EMB_POST_OP(kind, NP, C, T, DEVICE, NTH) \ + static void BM_EMB_POST_OP##_##kind##_##NP##_##C##_##T##_##DEVICE##_##NTH( \ + int iters) { \ + testing::UseRealTime(); \ + SessionOptions opts; \ + opts.config.set_intra_op_parallelism_threads(NTH); \ + test::Benchmark(#DEVICE, EmbPostOp(#kind, NP, #C, -1.0, -1), &opts) \ + .Run(iters); \ + } \ + BENCHMARK(BM_EMB_POST_OP##_##kind##_##NP##_##C##_##T##_##DEVICE##_##NTH); + +#define BM_EMB_POST_OP_kind(NP, C, NTH) \ + BM_EMB_POST_OP(OPT, NP, C, float, CPU, NTH); + +#define BM_EMB_POST_OP_NTH(NP, C) \ + BM_EMB_POST_OP_kind(NP, C, 1); \ + BM_EMB_POST_OP_kind(NP, C, 4); \ + BM_EMB_POST_OP_kind(NP, C, 8); + +BM_EMB_POST_OP_NTH(2, sum); + +} // namespace +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_pre_op.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_pre_op.cc new file mode 100644 index 00000000..c74e2317 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_pre_op.cc @@ -0,0 +1,315 @@ +#define EIGEN_USE_THREADS + +#include + +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/resource_var.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +namespace { + +struct IndicePair { + int64_t row; + int64_t column; +}; + +enum Part_Strategy { MOD, DIV, DIV_EV }; + +typedef void (*PARTITIONALGO)(const int64_t* id_table, + const int64_t numPartitions, + const int64_t idsPerPartition, + const int64_t extras, const int64_t originId, + int64_t* segment, int64_t* newId); + 
+template +inline void GetPartitionIndex(const int64_t* id_table, + const int64_t numPartitions, + const int64_t idsPerPartition, + const int64_t extras, const int64_t originId, + int64_t* segment, int64_t* newId) {} + +template <> +inline void GetPartitionIndex( + const int64_t* id_table, const int64_t numPartitions, + const int64_t idsPerPartition, const int64_t extras, const int64_t originId, + int64_t* segment, int64_t* newId) { + *segment = originId % numPartitions; + *newId = originId / numPartitions; +} + +template <> +inline void GetPartitionIndex( + const int64_t* id_table, const int64_t numPartitions, + const int64_t idsPerPartition, const int64_t extras, const int64_t originId, + int64_t* segment, int64_t* newId) { +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + const int64_t* prange = id_table + numPartitions % 8; + __m512i voffset = _mm512_set1_epi64(originId); + int vectorSize = numPartitions / 8; + for (int i = vectorSize - 1; i >= 0; --i) { + __m512i vrange = _mm512_maskz_loadu_epi64(0xff, prange + i * 8); + __mmask8 mask = _mm512_cmple_epi64_mask(vrange, voffset); + if (mask != 0) { + int numGreater = __builtin_ctz(mask); + *segment = (numPartitions - 1) - 8 * (vectorSize - 1 - i) - numGreater; + *newId = originId - id_table[*segment]; + return; + } + } + + for (int j = numPartitions % 8 - 1; j > -1; --j) { + if (originId >= id_table[j]) { + *segment = j; + *newId = originId - id_table[j]; + break; + } + } +#else + *segment = originId < extras * (idsPerPartition + 1) + ? originId / (idsPerPartition + 1) + : (originId - extras) / idsPerPartition; + *newId = *segment < extras ? originId % (idsPerPartition + 1) + : (originId - extras) % idsPerPartition; +#endif +} + +template <> +inline void GetPartitionIndex( + const int64_t* id_table, const int64_t numPartitions, + const int64_t idsPerPartition, const int64_t extras, const int64_t originId, + int64_t* segment, int64_t* newId) { + *segment = originId < 0 ? *segment : 0; + *newId = originId; +} +} // namespace + +typedef Eigen::ThreadPoolDevice CPUDevice; + +class FusedEmbeddingSparsePreLookUpCPU : public OpKernel { + public: + explicit FusedEmbeddingSparsePreLookUpCPU(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("num_partitions", &num_partitions_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("partition_axis", &partition_axis_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("fill_empty_row", &fill_empty_row_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("prune_invalid_id", &prune_invalid_id_)); + + int temp_default_id; + OP_REQUIRES_OK(ctx, ctx->GetAttr("default_id", &temp_default_id)); + default_id_ = int64_t(temp_default_id); + OP_REQUIRES_OK( + ctx, ctx->GetAttr("partition_strategy", &partition_strategy_str_)); + if (partition_strategy_str_ == "div") { + partition_strategy_ = GetPartitionIndex; + } else if (partition_strategy_str_ == "mod") { + partition_strategy_ = GetPartitionIndex; + } else if (partition_strategy_str_ == "div_ev") { + partition_strategy_ = GetPartitionIndex; + } else { + OP_REQUIRES( + ctx, false, + errors::InvalidArgument("Not support partition_strategy type. ", + partition_strategy_)); + } + } + + void Compute(OpKernelContext* ctx) override { + const int64_t default_id = default_id_ >= 0 ? default_id_ : 0; + // 1. 
get input tensor + Tensor const* values_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("sp_values", &values_tensor)); + const int64_t nnz = values_tensor->shape().dim_size(0); + + const int64_t* values = + reinterpret_cast(values_tensor->flat().data()); + + Tensor const* indices_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("sp_indices", &indices_tensor)); + + const int64_t* indices = + reinterpret_cast(indices_tensor->flat().data()); + + Tensor const* dense_shape = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("sp_dense_shape", &dense_shape)); + const int64_t batch_size = dense_shape->flat().data()[0]; + + OpInputList partition_shapes; + OP_REQUIRES_OK(ctx, ctx->input_list("partition_shapes", &partition_shapes)); + + partition_total_sizes_ = 0; + for (const Tensor& shape : partition_shapes) { + OP_REQUIRES(ctx, shape.dims() <= 2, + errors::InvalidArgument( + "input partition_shapes must all less than rank 2")); + partition_total_sizes_ += shape.flat().data()[0]; + } + + if (partition_total_sizes_ == 1) { + partition_strategy_ = GetPartitionIndex; + } + + // 1.1 define output tensors + OpOutputList partitioned_values; + OP_REQUIRES_OK(ctx, + ctx->output_list("partitioned_values", &partitioned_values)); + OpOutputList partitioned_indices; + OP_REQUIRES_OK( + ctx, ctx->output_list("partitioned_indices", &partitioned_indices)); + + Tensor* all_flags; + OP_REQUIRES_OK( + ctx, ctx->allocate_output(2 * num_partitions_, + TensorShape{batch_size + nnz}, &all_flags)); + int32_t* all_flags_list = all_flags->flat().data(); + + memset(all_flags_list, 0, (batch_size + nnz) * sizeof(int32_t)); + + // 2.1 get index + const int64_t idsPerPartition = partition_total_sizes_ / num_partitions_; + const int64_t extras = partition_total_sizes_ % num_partitions_; + std::vector empty_index_; + // [p_seg_nums + list(p_seg, p_id)] + int64_t* const id_index_array = new int64_t[num_partitions_ + 1 + nnz * 2]; + memset(id_index_array, 0, (num_partitions_ + 1) * sizeof(int64_t)); + + // 2.2 get the map of the mutli-table index + int64_t default_p_seg = 0; + int64_t default_p_val = 0; + int64_t p_seg = 0; + int64_t p_val = 0; + register int64_t tmp_id; + int64_t* const min_id_per_seg = new int64_t[num_partitions_]; +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + int64_t* tmp_value_arr; + + // 2.1 build min_id_per_seg + memset(min_id_per_seg, 0, (num_partitions_) * sizeof(int64_t)); + for (int i = 0; i < num_partitions_; ++i) { + min_id_per_seg[i] = + i < extras ? i * (idsPerPartition + 1) : i * idsPerPartition + extras; + } + + // 2.2.1 get new seg & id in id_index_array + int64_t* new_p_seg; + int64_t* new_p_id; + int64_t* id_indices = id_index_array + num_partitions_ + 1; + + for (int64_t index = 0; index < nnz; ++index) { + new_p_seg = id_indices + index * 2; + new_p_id = id_indices + index * 2 + 1; + + // set default values; + *(new_p_seg) = prune_invalid_id_ ? num_partitions_ : 0; + *(new_p_id) = *(values + index); + + // set all_flags_list; + all_flags_list[batch_size + index] = (*new_p_id < 0) ? 0 : 1; + all_flags_list[*(indices + index * 2)] += + !prune_invalid_id_ || !(*new_p_id < 0); + + partition_strategy_(min_id_per_seg, num_partitions_, idsPerPartition, + extras, *(new_p_seg + 1), new_p_seg, new_p_id); + ++id_index_array[*new_p_seg]; + } + +#else + for (int64_t index = 0; index < nnz; ++index) { + tmp_id = values[index]; + if (tmp_id < 0) { + p_seg = prune_invalid_id_ ? 
num_partitions_ : 0; + p_val = values[index]; + all_flags_list[*(indices + 2 * index)] += !p_seg; + } else { + all_flags_list[batch_size + index] = 1; + ++all_flags_list[*(indices + 2 * index)]; + partition_strategy_(nullptr, num_partitions_, idsPerPartition, extras, + tmp_id, &p_seg, &p_val); + } + ++id_index_array[p_seg]; + *(id_index_array + 2 * index + num_partitions_ + 1) = p_seg; + *(id_index_array + 2 * index + num_partitions_ + 2) = p_val; + } +#endif + + // 2.3 fill_empty_row_index_ + if (fill_empty_row_) { + // get default id p_seg_ and p_val_ + partition_strategy_(min_id_per_seg, num_partitions_, idsPerPartition, + extras, default_id, &default_p_seg, &default_p_val); + for (int64_t origin_index = 0; origin_index < batch_size; + ++origin_index) { + if (all_flags_list[origin_index]) { + all_flags_list[origin_index] = 0; + continue; + } + all_flags_list[origin_index] = 1; + empty_index_.push_back(origin_index); + empty_index_.push_back(0); + } + } + + // 3 packaging the output tensor + for (int i = 0; i < num_partitions_; ++i) { + int64_t size = id_index_array[i]; + if (fill_empty_row_ && i == default_p_seg) { + size += empty_index_.size() >> 1; + } + + Tensor* sub_partitioned_values; + OP_REQUIRES_OK(ctx, partitioned_values.allocate( + i, TensorShape({static_cast(size)}), + &sub_partitioned_values)); + int64_t* sub_p_values = reinterpret_cast( + sub_partitioned_values->flat().data()); + + Tensor* sub_partitioned_indices; + OP_REQUIRES_OK(ctx, partitioned_indices.allocate( + i, TensorShape({static_cast(size), 2}), + &sub_partitioned_indices)); + + int64_t* sub_p_indces = reinterpret_cast( + sub_partitioned_indices->flat().data()); + if (!size) continue; + + int sub_part_index = 0; + for (int index = 0; index < nnz; ++index) { + if (id_index_array[(index) * 2 + num_partitions_ + 1] == i) { + sub_p_values[sub_part_index] = + id_index_array[(index) * 2 + num_partitions_ + 2]; + sub_p_indces[sub_part_index * 2] = *(indices + (index) * 2); + sub_p_indces[sub_part_index * 2 + 1] = *(indices + (index) * 2 + 1); + ++sub_part_index; + } + } + if (fill_empty_row_ && default_p_seg == i) { + memcpy(sub_p_indces + sub_part_index * 2, empty_index_.data(), + empty_index_.size() * sizeof(int64_t)); + + std::fill(sub_p_values + sub_part_index, sub_p_values + size, + default_p_val); + } + } + delete[] min_id_per_seg; + delete[] id_index_array; + } + + private: + int num_partitions_; + int partition_total_sizes_; + int partition_axis_; + bool fill_empty_row_; + bool prune_invalid_id_; + int64_t default_id_; + PARTITIONALGO partition_strategy_; + std::string partition_strategy_str_; +}; + +REGISTER_KERNEL_BUILDER( + Name("FusedEmbeddingSparsePreLookUp").Device(DEVICE_CPU), + FusedEmbeddingSparsePreLookUpCPU); +} // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_pre_op_test.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_pre_op_test.cc new file mode 100644 index 00000000..ea74e624 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/embedding_lookup_sparse_pre_op_test.cc @@ -0,0 +1,627 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session.h" +namespace tensorflow { +namespace { + +enum class Device { CPU, GPU }; + +class FusedEmbeddingSparsePreLookUpOpTest : public OpsTestBase { + protected: + void MakeOpAndSetDevice(Device device, const int num_partitions, + const bool fill_empty_row, + const bool prune_invalid_id, const int default_id, + const string partition_strategy = "div") { + if (device == Device::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + + TF_EXPECT_OK(NodeDefBuilder("FusedEmbeddingSparsePreLookUp", + "FusedEmbeddingSparsePreLookUp") + .Attr("num_partitions", num_partitions) + .Attr("partition_strategy", partition_strategy) + .Attr("partition_axis", 0) + .Attr("fill_empty_row", fill_empty_row) + .Attr("prune_invalid_id", prune_invalid_id) + .Attr("default_id", default_id) + .Input(FakeInput(num_partitions, DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + } +}; + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, Ev_Int64) { + MakeOpAndSetDevice(Device::CPU, 1, false, false, -1); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {1, 1}); + // sp_values + AddInputFromArray(TensorShape({12}), + {1, 5, 3, 6, 12, 14, 15, 0, 5, 5, 11, 7}); + // sp_indices + AddInputFromArray(TensorShape({12, 2}), + {2, 3, 4, 6, 1, 6, 12, 12, 12, 12, 11, 5, + 15, 0, 11, 6, 7, 9, 11, 8, 12, 13, 13, 0}); + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {16, 16}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({12})); + test::FillValues(&expected_values, + {1, 5, 3, 6, 12, 14, 15, 0, 5, 5, 11, 7}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({12, 2})); + test::FillValues(&expected_indices, + {2, 3, 4, 6, 1, 6, 12, 12, 12, 12, 11, 5, + 15, 0, 11, 6, 7, 9, 11, 8, 12, 13, 13, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(1)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, Ev_Fill_Empty) { + MakeOpAndSetDevice(Device::CPU, 1, true, false, -1); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {1, 1}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + 
// sp_dense_shape + AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({11})); + test::FillValues(&expected_values, + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2, 0}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({11, 2})); + test::FillValues( + &expected_indices, + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7, 2, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(1)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, Ev_Fill_Empty_Prune_Invalid) { + MakeOpAndSetDevice(Device::CPU, 1, true, true, -1); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {1, 1}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({9})); + test::FillValues(&expected_values, {0, 4, 3, 5, 9, 2, 0, 0, 0}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({9, 2})); + test::FillValues(&expected_indices, {0, 0, 0, 4, 1, 2, 3, 4, 6, 0, 6, + 7, 2, 0, 4, 0, 5, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(1)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, Partition3_Int64) { + MakeOpAndSetDevice(Device::CPU, 3, false, false, -1); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {6, 16}); + // partition_shapes 1 + AddInputFromArray(TensorShape({2}), {3, 16}); + // partition_shapes 2 + AddInputFromArray(TensorShape({2}), {7, 16}); + // sp_values + AddInputFromArray(TensorShape({12}), + {1, 5, 3, 6, 12, 14, 15, 0, 5, 5, 11, 7}); + // sp_indices + AddInputFromArray(TensorShape({12, 2}), + {2, 3, 4, 6, 1, 6, 12, 12, 12, 12, 11, 5, + 15, 0, 11, 6, 7, 9, 11, 8, 12, 13, 13, 0}); + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {16, 16}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({6})); + test::FillValues(&expected_values, {1, 5, 3, 0, 5, 5}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({6, 2})); + test::FillValues(&expected_indices, + {2, 3, 4, 6, 1, 6, 11, 6, 7, 9, 11, 8}); + test::ExpectTensorEqual(expected_indices, *GetOutput(3)); + } + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({2})); + test::FillValues(&expected_values, {0, 1}); + test::ExpectTensorEqual(expected_values, *GetOutput(1)); + Tensor expected_indices(allocator(), DT_INT64, TensorShape({2, 2})); + test::FillValues(&expected_indices, {12, 12, 13, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(4)); + } + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({4})); + test::FillValues(&expected_values, {1, 3, 4, 0}); + test::ExpectTensorEqual(expected_values, *GetOutput(2)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({4, 2})); + test::FillValues(&expected_indices, {12, 12, 11, 5, 15, 0, 12, 13}); + test::ExpectTensorEqual(expected_indices, *GetOutput(5)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, Partition2_Fill_Empty) { 
+ MakeOpAndSetDevice(Device::CPU, 2, true, false, -1); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {5, 8}); + // partition_shapes 1 + AddInputFromArray(TensorShape({2}), {5, 8}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({9})); + test::FillValues(&expected_values, {0, 4, 3, -2, -3, -4, -6, 2, 0}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({9, 2})); + test::FillValues(&expected_indices, {0, 0, 0, 4, 1, 2, 3, 0, 4, 0, 5, + 2, 6, 1, 6, 7, 2, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(2)); + } + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({2})); + test::FillValues(&expected_values, {0, 4}); + test::ExpectTensorEqual(expected_values, *GetOutput(1)); + Tensor expected_indices(allocator(), DT_INT64, TensorShape({2, 2})); + test::FillValues(&expected_indices, {3, 4, 6, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(3)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, + Partition2_Fill_Empty_Prune_Invalid) { + MakeOpAndSetDevice(Device::CPU, 2, true, true, -1); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {5, 8}); + // partition_shapes 1 + AddInputFromArray(TensorShape({2}), {5, 8}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({7})); + test::FillValues(&expected_values, {0, 4, 3, 2, 0, 0, 0}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({7, 2})); + test::FillValues(&expected_indices, + {0, 0, 0, 4, 1, 2, 6, 7, 2, 0, 4, 0, 5, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(2)); + } + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({2})); + test::FillValues(&expected_values, {0, 4}); + test::ExpectTensorEqual(expected_values, *GetOutput(1)); + Tensor expected_indices(allocator(), DT_INT64, TensorShape({2, 2})); + test::FillValues(&expected_indices, {3, 4, 6, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(3)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, + Partition2_Fill_Empty_Prune_Invalid_Default_7) { + MakeOpAndSetDevice(Device::CPU, 2, true, true, 7); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {5, 8}); + // partition_shapes 1 + AddInputFromArray(TensorShape({2}), {5, 8}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({4})); + test::FillValues(&expected_values, {0, 4, 3, 
2}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({4, 2})); + test::FillValues(&expected_indices, {0, 0, 0, 4, 1, 2, 6, 7}); + test::ExpectTensorEqual(expected_indices, *GetOutput(2)); + } + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({5})); + test::FillValues(&expected_values, {0, 4, 2, 2, 2}); + test::ExpectTensorEqual(expected_values, *GetOutput(1)); + Tensor expected_indices(allocator(), DT_INT64, TensorShape({5, 2})); + test::FillValues(&expected_indices, {3, 4, 6, 0, 2, 0, 4, 0, 5, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(3)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, + Partition2_Prune_Invalid_Default_3) { + MakeOpAndSetDevice(Device::CPU, 2, false, true, 3); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {5, 8}); + // partition_shapes 1 + AddInputFromArray(TensorShape({2}), {5, 8}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({4})); + test::FillValues(&expected_values, {0, 4, 3, 2}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({4, 2})); + test::FillValues(&expected_indices, {0, 0, 0, 4, 1, 2, 6, 7}); + test::ExpectTensorEqual(expected_indices, *GetOutput(2)); + } + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({2})); + test::FillValues(&expected_values, {0, 4}); + test::ExpectTensorEqual(expected_values, *GetOutput(1)); + Tensor expected_indices(allocator(), DT_INT64, TensorShape({2, 2})); + test::FillValues(&expected_indices, {3, 4, 6, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(3)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, Partition1) { + MakeOpAndSetDevice(Device::CPU, 1, false, false, -1); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {10, 8}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({10})); + test::FillValues(&expected_values, + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({10, 2})); + test::FillValues(&expected_indices, {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, + 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + test::ExpectTensorEqual(expected_indices, *GetOutput(1)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, + Partition1_Fill_Empty_Prune_Invalid_Default_3) { + MakeOpAndSetDevice(Device::CPU, 1, true, true, 3); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {10, 8}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + // sp_dense_shape + 
AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({9})); + test::FillValues(&expected_values, {0, 4, 3, 5, 9, 2, 3, 3, 3}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + ; + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({9, 2})); + test::FillValues(&expected_indices, {0, 0, 0, 4, 1, 2, 3, 4, 6, 0, 6, + 7, 2, 0, 4, 0, 5, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(1)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, Partition3_Int64_Perfs) { + int num_partitions = 4; + int batch_size = 100000; + int num_per_part = batch_size / num_partitions; + int embed_dim = 32; + int default_id = -1; + + std::vector sp_values; + std::vector sp_indices; + + MakeOpAndSetDevice(Device::CPU, num_partitions, false, false, default_id); + + for (int i = 0; i < num_partitions; ++i) { + AddInputFromArray(TensorShape({2}), + {num_per_part * embed_dim, embed_dim}); + } + + for (int i = 0; i < batch_size * embed_dim; ++i) { + sp_values.push_back(i); + } + + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < embed_dim; ++j) { + sp_indices.push_back(i); + sp_indices.push_back(j); + } + } + // sp_values + AddInputFromArray(TensorShape({sp_values.size()}), sp_values); + // sp_indices + AddInputFromArray(TensorShape({sp_values.size(), 2}), sp_indices); + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {batch_size, embed_dim}); + TF_ASSERT_OK(RunOpKernel()); +} + +//----------------------------------------------------------------------------// +// Performance benchmarks // +//----------------------------------------------------------------------------// + +template +void FillValues(Tensor* tensor, gtl::ArraySlice vals) { + auto flat = tensor->flat(); + CHECK_EQ(flat.size(), vals.size()); + if (flat.size() > 0) { + std::copy_n(vals.data(), vals.size(), flat.data()); + } +} + +template +void FillZerosValues(Tensor* tensor) { + auto flat = tensor->flat(); + for (int i = 0; i < flat.size(); ++i) { + flat.data()[i] = 0.0; + } +} + +template +void FillOnesValues(Tensor* tensor) { + auto flat = tensor->flat(); + float scale = std::rand() / ((RAND_MAX + 1u) / 6); + for (int i = 0; i < flat.size(); ++i) { + flat.data()[i] = 1.1 * scale; + } +} + +template +void FillIndiceValues(Tensor* tensor, const int partitions, + const int batch_size, const int entries) { + auto flat = tensor->flat(); + int k = 0; + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < entries; ++j) { + flat.data()[k] = i + partitions; + flat.data()[k + 1] = j; + k += 2; + } + } +} + +template +void PrintValues(Tensor* tensor) { + auto flat = tensor->flat(); + for (int i = 0; i < flat.size(); ++i) { + std::cout << flat.data()[i] << ", "; + } + std::cout << std::endl; +} + +template +static Graph* EmbPreOp(const string& kind, int num_partitions, + const std::string& combiner, const float max_norm, + const int default_id) { + int batch_size = 100000; + int num_per_part = batch_size / num_partitions; + int embed_dim = 32; + const string partition_strategy = "div"; + const bool fill_empty_row = false; + const bool prune_invalid_id = false; + + Graph* g = new Graph(OpRegistry::Global()); + DataType type = DataTypeToEnum::v(); + + const bool isDefault = (kind == "Default"); + string op_name = isDefault ? 
"FusedEmbeddingSparsePreLookUp" + : "FusedEmbeddingSparsePreLookUp"; + + std::vector sp_values; + std::vector sp_indices; + + // partitioned_indices + std::vector partitioned_indices; + partitioned_indices.reserve(num_partitions); + for (int i = 0; i < num_partitions; ++i) { + Tensor sub_partitioned_indice(DT_INT64, TensorShape({2})); + FillValues(&sub_partitioned_indice, + {num_per_part * embed_dim, embed_dim}); + partitioned_indices.push_back( + test::graph::Constant(g, sub_partitioned_indice)); + } + + for (int i = 0; i < batch_size * embed_dim; ++i) { + sp_values.push_back(i); + } + + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < embed_dim; ++j) { + sp_indices.push_back(i); + sp_indices.push_back(j); + } + } + + // sp_values + Tensor sp_values_t(DT_INT64, TensorShape({sp_values.size()})); + FillValues(&sp_values_t, sp_values); + + // sp_indices + Tensor sp_indices_t(DT_INT64, TensorShape({sp_values.size(), 2})); + FillValues(&sp_indices_t, sp_indices); + + // sp_dense_shape + Tensor sp_dense_shape_t(DT_INT64, TensorShape({2})); + FillValues(&sp_dense_shape_t, {batch_size, embed_dim}); + + auto nodeBuilder = NodeBuilder(g->NewName("n"), op_name) + .Attr("num_partitions", num_partitions) + .Attr("partition_strategy", partition_strategy) + .Attr("partition_axis", 0) + .Attr("fill_empty_row", fill_empty_row) + .Attr("prune_invalid_id", prune_invalid_id) + .Attr("default_id", default_id) + .Input(partitioned_indices) + .Input(test::graph::Constant(g, sp_values_t)) + .Input(test::graph::Constant(g, sp_indices_t)) + .Input(test::graph::Constant(g, sp_dense_shape_t)); + TF_CHECK_OK(nodeBuilder.Finalize(g, nullptr)); + return g; +} + +#define BM_EMB_PRE_OP(kind, NP, C, T, DEVICE, NTH) \ + static void BM_EMB_PRE_OP##_##kind##_##NP##_##C##_##T##_##DEVICE##_##NTH( \ + int iters) { \ + testing::UseRealTime(); \ + SessionOptions opts; \ + opts.config.set_intra_op_parallelism_threads(NTH); \ + test::Benchmark(#DEVICE, EmbPreOp(#kind, NP, #C, -1.0, -1), &opts) \ + .Run(iters); \ + } \ + BENCHMARK(BM_EMB_PRE_OP##_##kind##_##NP##_##C##_##T##_##DEVICE##_##NTH); + +#define BM_EMB_PRE_OP_kind(NP, C, NTH) \ + BM_EMB_PRE_OP(OPT, NP, C, float, CPU, NTH); + +#define BM_EMB_PRE_OP_NTH(NP, C) \ + BM_EMB_PRE_OP_kind(NP, C, 1); \ + // BM_EMB_PRE_OP_kind(NP, C, 4); \ + // BM_EMB_PRE_OP_kind(NP, C, 8); \ + +BM_EMB_PRE_OP_NTH(4, sum); + +} // namespace +} // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_common.cu.h b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_common.cu.h new file mode 100644 index 00000000..19b839f0 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_common.cu.h @@ -0,0 +1,98 @@ +#ifndef TENSORFLOW_CORE_KERNELS_FUSED_EMBEDDING_FUSED_EMBEDDING_COMMON_CU_H_ +#define TENSORFLOW_CORE_KERNELS_FUSED_EMBEDDING_FUSED_EMBEDDING_COMMON_CU_H_ + +#if GOOGLE_CUDA + +#define CK_CUDA_THROW_(x) \ + do { \ + cudaError_t retval = (x); \ + if (retval != cudaSuccess) { \ + throw std::runtime_error(std::string("Runtime error: ") + \ + (cudaGetErrorString(retval)) + " " + __FILE__ + \ + ":" + std::to_string(__LINE__) + " \n"); \ + } \ + } while (0) + +namespace tensorflow { + +namespace { + +inline int CalcBlocksLinearMapping(const int problem_size, const int threads) { + return problem_size % threads == 0 ? 
(problem_size / threads) + : (problem_size / threads + 1); +} + +struct IndicePair { + int64_t row_in_batch; + int64_t entry_in_column; +}; + +enum Combiner { Mean, Sum, Sqrtn }; + +template +__forceinline__ __device__ float Combine(const float in, const T feature_num); + +template <> +__forceinline__ __device__ float Combine(const float in, + const int feature_num) { + return in / sqrtf(feature_num); +} + +template <> +__forceinline__ __device__ float Combine(const float in, + const int feature_num) { + return in / feature_num; +} + +template <> +__forceinline__ __device__ float Combine(const float in, + const int feature_num) { + return in; +} + +template <> +__forceinline__ __device__ float Combine( + const float in, const float feature_num) { + return in / sqrtf(feature_num); +} + +template <> +__forceinline__ __device__ float Combine(const float in, + const float feature_num) { + return in / feature_num; +} + +template <> +__forceinline__ __device__ float Combine(const float in, + const float feature_num) { + return in; +} + +template +__forceinline__ __device__ float CombineGrad(const float grad, + const int feature_num); + +template <> +__forceinline__ __device__ float CombineGrad(const float grad, + const int feature_num) { + return grad / sqrtf(feature_num); +} + +template <> +__forceinline__ __device__ float CombineGrad(const float grad, + const int feature_num) { + return grad / feature_num; +} + +template <> +__forceinline__ __device__ float CombineGrad(const float grad, + const int feature_num) { + return grad; +} +} // namespace + +} // namespace tensorflow + +#endif // GOOGLE_CUDA + +#endif // TENSORFLOW_CORE_KERNELS_FUSED_EMBEDDING_FUSED_EMBEDDING_COMMON_CU_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_local_ops_gpu.cu.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_local_ops_gpu.cu.cc new file mode 100644 index 00000000..84581673 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_local_ops_gpu.cu.cc @@ -0,0 +1,315 @@ +#include +#include + +#include "tensorflow/core/framework/op_kernel.h" + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "fused_embedding_common.cu.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { +using GPUDevice = Eigen::GpuDevice; + +namespace { + +__global__ void SetToIntMaxSTG128(int* values_offset, const int batch_size) { + const int thread_offset = 4 * (blockIdx.x * blockDim.x + threadIdx.x); + const int int_max = 0x7fffffff; + if (thread_offset + 4 < batch_size) { + ::int4 four = make_int4(int_max, int_max, int_max, int_max); + *((::int4*)(values_offset + thread_offset)) = four; + } else if (thread_offset < batch_size) { + for (int i = thread_offset; i < batch_size; i++) { + values_offset[i] = int_max; + } + } +} + +__global__ void CalcPerElementRowInBatchValuesOffset(const int64_t* indices, + int* values_offset, + const int64_t nnz) { + const int thread_offset = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_offset < int(nnz)) { + const int64_t element_row = indices[2 * thread_offset]; + atomicMin(values_offset + int(element_row), thread_offset); + } +} + +template +__global__ void EmbeddingLookUp(const float* emb_variable, + const int64_t* values, const int* values_offset, + float* embedding_vector, const float max_norm, + const int emb_vec_size, + const int64_t batch_size, const int64_t nnz) { + __shared__ float l2_sum[1]; + + int value_offset = values_offset[blockIdx.x]; + int feature_num; + 
if (blockIdx.x == int(batch_size) - 1) { + feature_num = int(nnz) - value_offset; + } else { + feature_num = values_offset[blockIdx.x + 1] - value_offset; + } + float out = 0.0f; + for (int i = 0; i < feature_num; i++) { + float emb_element = + emb_variable[int(values[value_offset + i]) * emb_vec_size + + threadIdx.x]; + if (max_norm >= 0.0f) { + // calc l2 norm of this emb row(per block) and compare with max_norm. + // if greater than max_norm, then clip every element with factor + // max_norm / l2norm + if (threadIdx.x == 0) { + l2_sum[0] = 0.0f; + } + __syncthreads(); + atomicAdd(l2_sum, emb_element * emb_element); + __syncthreads(); + float l2_norm = sqrtf(l2_sum[0]); + if (l2_norm > max_norm) { + emb_element *= max_norm / l2_norm; + } + } + out += emb_element; + } + + // combine + out = Combine(out, feature_num); + + // store the embedding vector + embedding_vector[blockIdx.x * emb_vec_size + threadIdx.x] = out; +} + +template +__global__ void DoEmbeddingGrad(const float* top_grad, + const float* emb_variable, + const int64_t* values, const int* values_offset, + float* grad_values, const float max_norm, + const int emb_vec_size, + const int64_t batch_size, const int64_t nnz) { + __shared__ float l2_sum[1]; + const int value_offset = values_offset[blockIdx.x]; + int feature_num; + if (blockIdx.x == int(batch_size) - 1) { + feature_num = int(nnz) - value_offset; + } else { + feature_num = values_offset[blockIdx.x + 1] - value_offset; + } + float grad = top_grad[blockIdx.x * emb_vec_size + threadIdx.x]; + grad = CombineGrad(grad, feature_num); + for (int i = 0; i < feature_num; i++) { + float grad_i = grad; + if (max_norm > 0.0f) { + float emb_element = + emb_variable[int(values[value_offset + i]) * emb_vec_size + + threadIdx.x]; + if (threadIdx.x == 0) { + l2_sum[0] = 0.0f; + } + __syncthreads(); + atomicAdd(l2_sum, emb_element * emb_element); + __syncthreads(); + float l2_norm = sqrtf(l2_sum[0]); + if (l2_norm > max_norm) { + grad_i *= max_norm / l2_norm; + } + } + grad_values[(value_offset + i) * emb_vec_size + threadIdx.x] = grad_i; + } +} + +} // namespace + +class FusedEmbeddingLocalSparseLookUpGPU : public OpKernel { + public: + explicit FusedEmbeddingLocalSparseLookUpGPU(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("combiner", &combiner_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_norm", &max_norm_)); + } + + void Compute(OpKernelContext* ctx) override { + auto stream = ctx->eigen_device().stream(); + + Tensor const* values_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("sp_values", &values_tensor)); + Tensor const* indices_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("sp_indices", &indices_tensor)); + Tensor const* dense_shape_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("sp_dense_shape", &dense_shape_tensor)); + Tensor const* emb_variable_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("emb_variable", &emb_variable_tensor)); + + auto dense_shape = dense_shape_tensor->flat().data(); + const size_t batch_size = dense_shape[0]; + const int64 nnz = indices_tensor->shape().dim_size(0); + const int64 emb_vec_size = emb_variable_tensor->shape().dim_size(1); + + TensorShape emb_vectors_tensor_shape; + + emb_vectors_tensor_shape = TensorShape( + std::vector({static_cast(batch_size), emb_vec_size})); + Tensor* emb_vectors_tensor = nullptr; + // allocate output + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, emb_vectors_tensor_shape, + &emb_vectors_tensor)); + + // allocate offset tensor + TensorShape values_offset_tensor_shape = + 
TensorShape(std::vector({static_cast(batch_size)})); + + Tensor* values_offset_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, values_offset_tensor_shape, + &values_offset_tensor)); + + { + const int threads = 1024; + int blocks = batch_size / threads; + blocks = batch_size % threads == 0 ? blocks : blocks + 1; + SetToIntMaxSTG128<<>>( + values_offset_tensor->flat().data(), int(batch_size)); + } + { + const int threads = 1024; + int blocks = nnz % threads == 0 ? (nnz / threads) : (nnz / threads + 1); + + // calculate values offset + CalcPerElementRowInBatchValuesOffset<<>>( + reinterpret_cast( + indices_tensor->flat().data()), + values_offset_tensor->flat().data(), nnz); + } + { + const int blocks = int(batch_size); + const int threads = int(emb_vec_size); + if (combiner_ == "sqrtn") { + EmbeddingLookUp<<>>( + reinterpret_cast( + emb_variable_tensor->flat().data()), + reinterpret_cast( + values_tensor->flat().data()), + values_offset_tensor->flat().data(), + reinterpret_cast(emb_vectors_tensor->flat().data()), + max_norm_, int(emb_vec_size), batch_size, nnz); + } else if (combiner_ == "mean") { + EmbeddingLookUp<<>>( + reinterpret_cast( + emb_variable_tensor->flat().data()), + reinterpret_cast( + values_tensor->flat().data()), + values_offset_tensor->flat().data(), + reinterpret_cast(emb_vectors_tensor->flat().data()), + max_norm_, int(emb_vec_size), batch_size, nnz); + } else { + EmbeddingLookUp<<>>( + reinterpret_cast( + emb_variable_tensor->flat().data()), + reinterpret_cast( + values_tensor->flat().data()), + values_offset_tensor->flat().data(), + reinterpret_cast(emb_vectors_tensor->flat().data()), + max_norm_, int(emb_vec_size), batch_size, nnz); + } + } + } + + private: + std::string combiner_; + float max_norm_; +}; + +REGISTER_KERNEL_BUILDER(Name("FusedEmbeddingLocalSparseLookUp") + .Device(DEVICE_GPU) + .HostMemory("sp_dense_shape"), + FusedEmbeddingLocalSparseLookUpGPU); + +class FusedEmbeddingLocalSparseLookUpGradGPU : public OpKernel { + public: + explicit FusedEmbeddingLocalSparseLookUpGradGPU(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("combiner", &combiner_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_norm", &max_norm_)); + } + + void Compute(OpKernelContext* ctx) override { + auto stream = ctx->eigen_device().stream(); + + Tensor const* top_grad_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("top_grad", &top_grad_tensor)); + + Tensor const* emb_variable_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("emb_variable", &emb_variable_tensor)); + Tensor const* values_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("sp_values", &values_tensor)); + Tensor const* values_offset_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("sp_values_offset", &values_offset_tensor)); + + const int64 emb_vec_size = top_grad_tensor->shape().dim_size(1); + const int64 batch_size = top_grad_tensor->shape().dim_size(0); + const int64 nnz = values_tensor->shape().dim_size(0); + + Tensor* grad_emb_weight_sp_values_tensor; + TensorShape grad_emb_weight_sp_values_tensor_shape = + TensorShape(std::vector({nnz, emb_vec_size})); + OP_REQUIRES_OK( + ctx, ctx->allocate_output(0, grad_emb_weight_sp_values_tensor_shape, + &grad_emb_weight_sp_values_tensor)); + + { + const int blocks = int(batch_size); + const int threads = int(emb_vec_size); + + if (combiner_ == "sqrtn") { + DoEmbeddingGrad<<>>( + reinterpret_cast( + top_grad_tensor->flat().data()), + reinterpret_cast( + emb_variable_tensor->flat().data()), + reinterpret_cast( + 
values_tensor->flat().data()), + values_offset_tensor->flat().data(), + reinterpret_cast( + grad_emb_weight_sp_values_tensor->flat().data()), + max_norm_, emb_vec_size, batch_size, nnz); + } else if (combiner_ == "mean") { + DoEmbeddingGrad<<>>( + reinterpret_cast( + top_grad_tensor->flat().data()), + reinterpret_cast( + emb_variable_tensor->flat().data()), + reinterpret_cast( + values_tensor->flat().data()), + values_offset_tensor->flat().data(), + reinterpret_cast( + grad_emb_weight_sp_values_tensor->flat().data()), + max_norm_, emb_vec_size, batch_size, nnz); + } else { + DoEmbeddingGrad<<>>( + reinterpret_cast( + top_grad_tensor->flat().data()), + reinterpret_cast( + emb_variable_tensor->flat().data()), + reinterpret_cast( + values_tensor->flat().data()), + values_offset_tensor->flat().data(), + reinterpret_cast( + grad_emb_weight_sp_values_tensor->flat().data()), + max_norm_, emb_vec_size, batch_size, nnz); + } + } + } + + private: + float max_norm_; + std::string combiner_; +}; + +REGISTER_KERNEL_BUILDER( + Name("FusedEmbeddingLocalSparseLookUpGrad").Device(DEVICE_GPU), + FusedEmbeddingLocalSparseLookUpGradGPU); + +} // namespace tensorflow +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_local_ops_test.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_local_ops_test.cc new file mode 100644 index 00000000..04e79ad7 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_local_ops_test.cc @@ -0,0 +1,419 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/cc/ops/const_op.h" +#include "tensorflow/cc/ops/image_ops.h" +#include "tensorflow/cc/ops/nn_ops.h" +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/conv_ops_gpu.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session.h" + +namespace tensorflow { +namespace { + +enum class Device { CPU, GPU }; + +enum TestCase { Sqrtn, Mean, Sum, SqrtnAndMaxNorm200, MeanAndMaxNorm100 }; + +template +void get_node_attr_from_test_case(string& combiner_str, float& max_norm) { + if (test_case == Sqrtn) { + combiner_str = "sqrtn"; + max_norm = -1.0f; + } else if (test_case == Mean) { + combiner_str = "mean"; + max_norm = -1.0f; + } else if (test_case == Sum) { + combiner_str = "sum"; + max_norm = -1.0f; + } else if (test_case == SqrtnAndMaxNorm200) { + combiner_str = "sqrtn"; + max_norm = 200.0f; + } else if (test_case == MeanAndMaxNorm100) { + combiner_str = "mean"; + max_norm = 100.0f; + } +} + +template +void fill_emb_vector_expected(Tensor* expected); + +template <> +void fill_emb_vector_expected(Tensor* expected) { + test::FillValues( + expected, {22.627416610717773, 24.0416316986084, 25.45584487915039, + 26.870058059692383, 28.284271240234375, 29.698484420776367, + 31.112699508666992, 32.526912689208984, 73.90083312988281, + 75.63288879394531, 77.36493682861328, 79.09698486328125, + 80.82904052734375, 82.56108856201172, 84.29314422607422, + 86.02519226074219, 124.70765686035156, 126.43971252441406, + 128.17176818847656, 129.90380859375, 131.6358642578125, + 133.367919921875, 135.09996032714844, 136.83201599121094, + 107.48023223876953, 108.89444732666016, 110.30866241455078, + 111.72286987304688, 113.1370849609375, 114.55130004882812, + 115.96551513671875, 117.37973022460938}); +} + +template <> +void fill_emb_vector_expected(Tensor* expected) { + test::FillValues( + expected, {16.00000000000000, 17.00000000000000, 18.00000000000000, + 19.00000000000000, 20.00000000000000, 21.00000000000000, + 22.00000000000000, 23.00000000000000, 42.66666793823242, + 43.66666793823242, 44.66666793823242, 45.66666793823242, + 46.66666793823242, 47.66666793823242, 48.66666793823242, + 49.66666793823242, 72.00000000000000, 73.00000000000000, + 74.00000000000000, 75.00000000000000, 76.00000000000000, + 77.00000000000000, 78.00000000000000, 79.00000000000000, + 76.00000000000000, 77.00000000000000, 78.00000000000000, + 79.00000000000000, 80.00000000000000, 81.00000000000000, + 82.00000000000000, 83.00000000000000}); +} + +template <> +void fill_emb_vector_expected(Tensor* expected) { + test::FillValues( + expected, {32.0, 34.0, 36.0, 38.0, 40.0, 42.0, 44.0, 46.0, + 128.0, 131.0, 134.0, 137.0, 140.0, 143.0, 146.0, 149.0, + 216.0, 219.0, 222.0, 225.0, 228.0, 231.0, 234.0, 237.0, + 152.0, 154.0, 156.0, 158.0, 160.0, 162.0, 164.0, 166.0}); +} + +template <> +void fill_emb_vector_expected(Tensor* expected) { + test::FillValues( + expected, + {22.62741661, 24.04163170, 25.45584488, 26.87005806, 28.28427124, + 29.69848442, 31.11269951, 32.52691269, 
73.90083313, 75.63288879, + 77.36493683, 79.09698486, 80.82904053, 82.56108856, 84.29314423, + 86.02519226, 92.61308289, 94.01081848, 95.40855408, 96.80628204, + 98.20401764, 99.60175323, 100.99948120, 102.39721680, 71.20205688, + 72.31395721, 73.42584991, 74.53774261, 75.64963531, 76.76153564, + 77.87342834, 78.98532867}); +} + +class FusedEmbeddingLocalSparseLookUpOpTest : public OpsTestBase { + protected: + template + void Run(Device device) { + if (device == Device::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + DataType dtype = DataTypeToEnum::value; + std::string combiner_str; + float max_norm; + + get_node_attr_from_test_case(combiner_str, max_norm); + + TF_EXPECT_OK(NodeDefBuilder("fused_embedding_local_sparse_look_up", + "FusedEmbeddingLocalSparseLookUp") + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(dtype)) + .Attr("T", dtype) + .Attr("combiner", combiner_str) + .Attr("max_norm", max_norm) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + const int nnz = 10; + const int batch_size = 4; + const int emb_vector_dim = 8; + const int entries = 8; + const int bucket_size = 16; + + Tensor sp_values(DT_INT64, {nnz}); + Tensor sp_indices(DT_INT64, {nnz, 2}); + Tensor sp_dense_shape(DT_INT64, {2}); + Tensor emb_variable(dtype, {bucket_size, emb_vector_dim}); + + test::FillValues(&sp_values, {3, 1, 4, 5, 7, 3, 12, 12, 15, 4}); + test::FillValues(&sp_indices, {0, 1, 0, 5, 1, 2, 1, 1, 1, 7, + 2, 1, 2, 4, 2, 7, 3, 0, 3, 6}); + test::FillValues(&sp_dense_shape, {batch_size, entries}); + test::FillValues( + &emb_variable, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, + 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, + 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, + 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, + 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, + 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, + 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, + 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, + 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, + 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, + 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, + 110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, + 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0}); + + AddInputFromArray(sp_values.shape(), sp_values.flat()); + AddInputFromArray(sp_indices.shape(), sp_indices.flat()); + AddInputFromArray(sp_dense_shape.shape(), + sp_dense_shape.flat()); + AddInputFromArray(emb_variable.shape(), emb_variable.flat()); + + TF_ASSERT_OK(RunOpKernel()); + + Tensor emb_vector_expected(dtype, {batch_size, emb_vector_dim}); + Tensor sp_values_offset_expected(DT_INT32, {batch_size}); + fill_emb_vector_expected(&emb_vector_expected); + test::FillValues(&sp_values_offset_expected, {0, 2, 5, 8}); + + const Tensor& emb_vector = *GetOutput(0); + const Tensor& values_offset = *GetOutput(1); + TF_EXPECT_OK(device_->Sync()); + + test::ExpectTensorNear(emb_vector_expected, emb_vector, 1e-4); + test::ExpectTensorEqual(sp_values_offset_expected, values_offset); + } +}; + +template +void fill_grad_expected(Tensor* expected); + +template <> +void fill_grad_expected(Tensor* expected) { + test::FillValues( + expected, {0.000000000000000, 0.7071067690849304, 1.4142135381698608, + 2.1213204860687256, 2.8284270763397217, 
3.535533905029297, + 4.242640972137451, 4.949747562408447, 0.000000000000000, + 0.7071067690849304, 1.4142135381698608, 2.1213204860687256, + 2.8284270763397217, 3.535533905029297, 4.242640972137451, + 4.949747562408447, 4.618802070617676, 5.196152687072754, + 5.773502826690674, 6.350852966308594, 6.928203582763672, + 7.505553722381592, 8.082903861999512, 8.66025447845459, + 4.618802070617676, 5.196152687072754, 5.773502826690674, + 6.350852966308594, 6.928203582763672, 7.505553722381592, + 8.082903861999512, 8.66025447845459, 4.618802070617676, + 5.196152687072754, 5.773502826690674, 6.350852966308594, + 6.928203582763672, 7.505553722381592, 8.082903861999512, + 8.66025447845459, 9.237604141235352, 9.81495475769043, + 10.392305374145508, 10.96965503692627, 11.547005653381348, + 12.124356269836426, 12.701705932617188, 13.279056549072266, + 9.237604141235352, 9.81495475769043, 10.392305374145508, + 10.96965503692627, 11.547005653381348, 12.124356269836426, + 12.701705932617188, 13.279056549072266, 9.237604141235352, + 9.81495475769043, 10.392305374145508, 10.96965503692627, + 11.547005653381348, 12.124356269836426, 12.701705932617188, + 13.279056549072266, 16.970563888549805, 17.677669525146484, + 18.384777069091797, 19.091882705688477, 19.79899024963379, + 20.5060977935791, 21.21320343017578, 21.920310974121094, + 16.970563888549805, 17.677669525146484, 18.384777069091797, + 19.091882705688477, 19.79899024963379, 20.5060977935791, + 21.21320343017578, 21.920310974121094}); +} + +template <> +void fill_grad_expected(Tensor* expected) { + test::FillValues( + expected, {0.000000000000000, 0.500000000000000, 1.000000000000000, + 1.500000000000000, 2.000000000000000, 2.500000000000000, + 3.000000000000000, 3.500000000000000, 0.000000000000000, + 0.500000000000000, 1.000000000000000, 1.500000000000000, + 2.000000000000000, 2.500000000000000, 3.000000000000000, + 3.500000000000000, 2.6666667461395264, 3.000000000000000, + 3.3333332538604736, 3.6666667461395264, 4.000000000000000, + 4.333333492279053, 4.666666507720947, 5.000000000000000, + 2.6666667461395264, 3.000000000000000, 3.3333332538604736, + 3.6666667461395264, 4.000000000000000, 4.333333492279053, + 4.666666507720947, 5.000000000000000, 2.6666667461395264, + 3.000000000000000, 3.3333332538604736, 3.6666667461395264, + 4.000000000000000, 4.333333492279053, 4.666666507720947, + 5.000000000000000, 5.333333492279053, 5.666666507720947, + 6.000000000000000, 6.333333492279053, 6.666666507720947, + 7.000000000000000, 7.333333492279053, 7.666666507720947, + 5.333333492279053, 5.666666507720947, 6.000000000000000, + 6.333333492279053, 6.666666507720947, 7.000000000000000, + 7.333333492279053, 7.666666507720947, 5.333333492279053, + 5.666666507720947, 6.000000000000000, 6.333333492279053, + 6.666666507720947, 7.000000000000000, 7.333333492279053, + 7.666666507720947, 12.000000000000000, 12.500000000000000, + 13.000000000000000, 13.500000000000000, 14.000000000000000, + 14.500000000000000, 15.000000000000000, 15.500000000000000, + 12.000000000000000, 12.500000000000000, 13.000000000000000, + 13.500000000000000, 14.000000000000000, 14.500000000000000, + 15.000000000000000, 15.500000000000000}); +} + +template <> +void fill_grad_expected(Tensor* expected) { + test::FillValues( + expected, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 0.0, 1.0, 2.0, 3.0, + 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 8.0, 9.0, 10.0, 11.0, + 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 
22.0, 23.0, + 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 16.0, 17.0, 18.0, 19.0, + 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0}); +} + +template <> +void fill_grad_expected(Tensor* expected) { + test::FillValues( + expected, + {0.00000000, 0.50000000, 1.00000000, 1.50000000, 2.00000000, + 2.50000000, 3.00000000, 3.50000000, 0.00000000, 0.50000000, + 1.00000000, 1.50000000, 2.00000000, 2.50000000, 3.00000000, + 3.50000000, 2.65028572, 2.98157120, 3.31285667, 3.64414287, + 3.97542834, 4.30671406, 4.63799953, 4.96928549, 2.16437674, + 2.43492365, 2.70547056, 2.97601795, 3.24656487, 3.51711202, + 3.78765893, 4.05820608, 1.58337951, 1.78130186, 1.97922409, + 2.17714667, 2.37506914, 2.57299161, 2.77091384, 2.96883631, + 5.33333349, 5.66666651, 6.00000000, 6.33333349, 6.66666651, + 7.00000000, 7.33333349, 7.66666651, 1.89459133, 2.01300311, + 2.13141513, 2.24982715, 2.36823893, 2.48665094, 2.60506320, + 2.72347474, 1.89459133, 2.01300311, 2.13141513, 2.24982715, + 2.36823893, 2.48665094, 2.60506320, 2.72347474, 3.43474555, + 3.57786012, 3.72097445, 3.86408877, 4.00720310, 4.15031767, + 4.29343224, 4.43654633, 11.92628479, 12.42321396, 12.92014217, + 13.41707039, 13.91399956, 14.41092777, 14.90785599, 15.40478516}); +} + +class FusedEmbeddingLocalSparseLookUpGradOpTest : public OpsTestBase { + protected: + template + void Run(Device device) { + if (device == Device::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + DataType dtype = DataTypeToEnum::value; + std::string combiner_str; + float max_norm; + get_node_attr_from_test_case(combiner_str, max_norm); + + TF_EXPECT_OK(NodeDefBuilder("fused_embedding_local_sparse_look_up_grad", + "FusedEmbeddingLocalSparseLookUpGrad") + .Input(FakeInput(dtype)) + .Input(FakeInput(dtype)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT32)) + .Attr("T", dtype) + .Attr("combiner", combiner_str) + .Attr("max_norm", max_norm) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + const int nnz = 10; + const int batch_size = 4; + const int emb_vector_dim = 8; + const int bucket_size = 16; + + Tensor top_grad(dtype, {batch_size, emb_vector_dim}); + Tensor emb_variable(dtype, {bucket_size, emb_vector_dim}); + Tensor sp_values(DT_INT64, {nnz}); + Tensor sp_values_offset(DT_INT32, {batch_size}); + + test::FillValues( + &top_grad, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, + 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0}); + test::FillValues( + &emb_variable, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, + 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, + 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, + 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, + 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, + 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, + 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, + 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, + 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, + 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, + 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, + 110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, + 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0}); + test::FillValues(&sp_values, {3, 1, 4, 5, 7, 3, 12, 12, 
15, 4}); + test::FillValues(&sp_values_offset, {0, 2, 5, 8}); + + AddInputFromArray(top_grad.shape(), top_grad.flat()); + AddInputFromArray(emb_variable.shape(), emb_variable.flat()); + AddInputFromArray(sp_values.shape(), sp_values.flat()); + AddInputFromArray(sp_values_offset.shape(), + sp_values_offset.flat()); + + TF_ASSERT_OK(RunOpKernel()); + + Tensor grad_expected(dtype, {nnz, emb_vector_dim}); + fill_grad_expected(&grad_expected); + + const Tensor& grad = *GetOutput(0); + TF_EXPECT_OK(device_->Sync()); + + test::ExpectTensorNear(grad_expected, grad, 1e-4); + } +}; + +#ifdef GOOGLE_CUDA +TEST_F(FusedEmbeddingLocalSparseLookUpOpTest, + EmbeddingLocalSparseLookUpFloatSqrtnGpu) { + Run(Device::GPU); +} + +TEST_F(FusedEmbeddingLocalSparseLookUpOpTest, + EmbeddingLocalSparseLookUpFloatMeanGpu) { + Run(Device::GPU); +} + +TEST_F(FusedEmbeddingLocalSparseLookUpOpTest, + EmbeddingLocalSparseLookUpFloatSumGpu) { + Run(Device::GPU); +} + +TEST_F(FusedEmbeddingLocalSparseLookUpOpTest, + EmbeddingLocalSparseLookUpFloatSqrtnAndMaxNorm200Gpu) { + Run(Device::GPU); +} + +TEST_F(FusedEmbeddingLocalSparseLookUpGradOpTest, + EmbeddingLocalSparseLookUpGradFloatGpu) { + Run(Device::GPU); +} + +TEST_F(FusedEmbeddingLocalSparseLookUpGradOpTest, + EmbeddingLocalSparseLookUpGradFloatMeanGpu) { + Run(Device::GPU); +} + +TEST_F(FusedEmbeddingLocalSparseLookUpGradOpTest, + EmbeddingLocalSparseLookUpGradFloatSumGpu) { + Run(Device::GPU); +} + +TEST_F(FusedEmbeddingLocalSparseLookUpGradOpTest, + EmbeddingLocalSparseLookUpGradFloatMeanAndMaxNorm100Gpu) { + Run(Device::GPU); +} + +#endif + +} // namespace +} // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_ops.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_ops.cc new file mode 100644 index 00000000..c8408134 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_ops.cc @@ -0,0 +1,308 @@ +#include + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +REGISTER_OP("FusedEmbeddingLocalSparseLookUp") + .Attr("T: {float32}") + .Attr("combiner: {'sqrtn', 'mean', 'sum'}") + .Attr("max_norm: float = -1.0") + .Input("sp_values: int64") + .Input("sp_indices: int64") + .Input("sp_dense_shape: int64") + .Input("emb_variable: T") + .Output("emb_vectors: T") + .Output("sp_values_offset: int32") + .SetShapeFn([](InferenceContext* ctx) { + ShapeHandle temp; + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(0), 1, &temp)); + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(1), 2, &temp)); + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(2), 1, &temp)); + ShapeHandle emb_var_shape; + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(3), 2, &emb_var_shape)); + + DimensionHandle emb_vec_size_dim = ctx->Dim(emb_var_shape, 1); + DimensionHandle batch_dim = ctx->UnknownDim(); + + ShapeHandle output_shape = ctx->MakeShape({batch_dim, emb_vec_size_dim}); + ctx->set_output(0, output_shape); + + return OkStatus(); + }); +// .Doc(R"doc( +// FusedEmbedding ops that performs a local embedding lookup. The process will +// perform embedding vector copying from emb_variable. The input is usually a +// SparseTensor. The output sp_values_offset is reserved for gradient +// calculation. 
+// )doc"); + +REGISTER_OP("FusedEmbeddingLocalSparseLookUpGrad") + .Attr("T: {float32}") + .Attr("combiner: {'sqrtn', 'mean', 'sum'}") + .Attr("max_norm: float = -1.0") + .Input("top_grad: T") + .Input("emb_variable: T") + .Input("sp_values: int64") + .Input("sp_values_offset: int32") + .Output("grad_emb_weight_sp_values: T") + .SetShapeFn([](InferenceContext* ctx) { + ShapeHandle top_grad_shape; + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(0), 2, &top_grad_shape)); + DimensionHandle emb_vec_size_dim = ctx->Dim(top_grad_shape, 1); + ctx->set_output(0, ctx->MakeShape({ctx->UnknownDim(), emb_vec_size_dim})); + return OkStatus(); + }); + +// .Doc(R"doc( +// The gradient ops for FusedEmbeddingLocalSparseLookUp. sp_values_offset from +// the forward op need to be passed to this grad op as input. +// )doc"); + +REGISTER_OP("FusedEmbeddingSparsePreLookUp") + .Attr("num_partitions: int >= 1 = 1") + .Attr("partition_axis: int >= 0 = 0") // for now only support = 0, + // will consider support = 1 + // if necessary + .Attr("fill_empty_row: bool = false") + .Attr("prune_invalid_id: bool = false") + .Attr("default_id: int = -1") + .Attr("partition_strategy: {'div','mod'} = 'div'") + .Input("partition_shapes: num_partitions * int64") + .Input("sp_values: int64") + .Input("sp_indices: int64") + .Input("sp_dense_shape: int64") + .Output("partitioned_values: num_partitions * int64") + .Output("partitioned_indices: num_partitions * int64") + .Output("row_empty_and_invalid_flags: int32") + .SetShapeFn([](InferenceContext* ctx) { + int num_partitions; + TF_RETURN_IF_ERROR(ctx->GetAttr("num_partitions", &num_partitions)); + int partition_axis; + TF_RETURN_IF_ERROR(ctx->GetAttr("partition_axis", &partition_axis)); + + ShapeHandle unused; + // sp_values + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(num_partitions), 1, &unused)); + // sp_indices + TF_RETURN_IF_ERROR( + ctx->WithRank(ctx->input(num_partitions + 1), 2, &unused)); + DimensionHandle unused_dim; + TF_RETURN_IF_ERROR(ctx->WithValue(ctx->Dim(unused, 1), 2, &unused_dim)); + // sp_dense_shape + TF_RETURN_IF_ERROR( + ctx->WithRank(ctx->input(num_partitions + 2), 1, &unused)); + + // partition_shapes + for (int i = 0; i < num_partitions; i++) { + ShapeHandle partition_shape; + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(i), 1, &partition_shape)); + TF_RETURN_IF_ERROR( + ctx->WithValue(ctx->NumElements(partition_shape), 2, &unused_dim)); + + ShapeHandle values_result_shape, indices_result_shape; + if (int(partition_axis) == 0) { + values_result_shape = ctx->MakeShape({ctx->UnknownDim()}); + indices_result_shape = ctx->MakeShape({ctx->UnknownDim(), 2}); + } else { + return errors::InvalidArgument("partition_axis > 0 not implemented!"); + } + ctx->set_output(i, values_result_shape); + ctx->set_output(i + num_partitions, indices_result_shape); + } + ctx->set_output(2 * num_partitions, ctx->MakeShape({ctx->UnknownDim()})); + + return OkStatus(); + }); +// .Doc(R"doc( +// A fused embedding op, usually using for partitioned and distriuted embedding +// variables. FusedEmbeddingSparsePreLookUp, FusedEmbeddingSparsePostLookUp +// should be used together. This op will first read the partition pattern of +// embedding variables through partition_shapes, then sort, re-calculate and +// assign the embedding indices to the corresponding partition. Several Gather +// ops usually should be appended after this op to gather embedding shards from +// multiple partitioned embedding variables. This op has no gradient function. 
+// )doc"); + +REGISTER_OP("FusedEmbeddingSparsePostLookUp") + .Attr("T : {float32}") + .Attr("num_partitions: int >= 1 = 1") + .Attr("default_id: int = -1") + .Attr("partition_axis: int >= 0 = 0") // for now only support = 0, + // will consider support = 1 + // if necessary + .Attr("combiner: {'sqrtn', 'mean', 'sum'}") + .Attr("max_norm: float = -1.0") + .Input("emb_shards: num_partitions * T") + .Input("partitioned_indices: num_partitions * int64") + .Input("sp_dense_shape: int64") + .Input("row_empty_and_invalid_flags: int32") + .Input( + "partitioned_values: num_partitions * int64") // only for backward use. + // actually directly port + // to python grad op + // output + .Output("emb_vectors: T") + .Output("feature_nums: int32") + .SetShapeFn([](InferenceContext* ctx) { + int num_partitions; + TF_RETURN_IF_ERROR(ctx->GetAttr("num_partitions", &num_partitions)); + + ShapeHandle first_emb_shard_shape; + TF_RETURN_IF_ERROR( + ctx->WithRank(ctx->input(0), 2, &first_emb_shard_shape)); + + ShapeHandle unused; + for (int i = 0; i < num_partitions; i++) { + // emb_shards + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(i), 2, &unused)); + // partitioned_indices + TF_RETURN_IF_ERROR( + ctx->WithRank(ctx->input(i + num_partitions), 2, &unused)); + DimensionHandle unused_dim; + TF_RETURN_IF_ERROR(ctx->WithValue(ctx->Dim(unused, 1), 2, &unused_dim)); + } + // sp_dense_shape + TF_RETURN_IF_ERROR( + ctx->WithRank(ctx->input(2 * num_partitions), 1, &unused)); + // row_empty_and_invalid_flags + TF_RETURN_IF_ERROR( + ctx->WithRank(ctx->input(2 * num_partitions + 1), 1, &unused)); + + DimensionHandle emb_vec_size_dim = ctx->Dim(first_emb_shard_shape, 1); + ctx->set_output(0, ctx->MakeShape({ctx->UnknownDim(), emb_vec_size_dim})); + ctx->set_output(1, ctx->MakeShape({ctx->UnknownDim()})); + return OkStatus(); + }); + +// .Doc(R"doc( +// A fused embedding op, usually using for partitioned and distriuted embedding +// variables. FusedEmbeddingSparsePreLookUp, FusedEmbeddingSparsePostLookUp +// should be used together. There should be several Gather ops before this op. +// The Gather ops gather embedding shards from embedding variable and this op +// glue them together, then apply combiner and max_morm according to embedding +// indices. 
+// )doc"); + +REGISTER_OP("FusedEmbeddingSparsePostLookUpGrad") + .Attr("T : {float32}") + .Attr("num_partitions: int >= 1 = 1") + .Attr("partition_axis: int >= 0 = 0") // for now only support = 0, + // will consider support = 1 + // if necessary + .Attr("default_id: int = -1") + .Attr("combiner: {'sqrtn', 'mean', 'sum'}") + .Attr("max_norm: float = -1.0") + .Input("top_grad: T") + .Input("emb_shards: num_partitions * T") + .Input("partitioned_indices: num_partitions * int64") + .Input("feature_nums: int32") + .Input("row_empty_and_invalid_flags: int32") + .Output("grad_shards: num_partitions * T") + .SetShapeFn([](InferenceContext* ctx) { + int num_partitions; + TF_RETURN_IF_ERROR(ctx->GetAttr("num_partitions", &num_partitions)); + + ShapeHandle unused; + ShapeHandle top_grad_shape; + + // top_grad + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(0), 2, &top_grad_shape)); + // emb_shards + for (int i = 1; i < num_partitions + 1; i++) { + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(i), 2, &unused)); + } + // partitioned_indices + for (int i = num_partitions + 1; i < 2 * num_partitions + 1; i++) { + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(i), 2, &unused)); + DimensionHandle unused_dim; + TF_RETURN_IF_ERROR(ctx->WithValue(ctx->Dim(unused, 1), 2, &unused_dim)); + } + // feature_nums + TF_RETURN_IF_ERROR( + ctx->WithRank(ctx->input(2 * num_partitions + 1), 1, &unused)); + // row_empty_and_invalid_flags + TF_RETURN_IF_ERROR( + ctx->WithRank(ctx->input(2 * num_partitions + 2), 1, &unused)); + + DimensionHandle emb_vec_size_dim = ctx->Dim(top_grad_shape, 1); + + ShapeHandle output_shape = + ctx->MakeShape({ctx->UnknownDim(), emb_vec_size_dim}); + for (int i = 0; i < num_partitions; i++) { + ctx->set_output(i, output_shape); + } + return OkStatus(); + }); + +// .Doc(R"doc( +// Calculate gradient of FusedEmbeddingSparsePostLookUp +// )doc"); + +REGISTER_OP("FusedSafeEmbeddingLookupSparseLocal") + .Input("weight: T_weight") + .Input("id_input: T_id") + .Input("dense_shape: T_shape") + .Input("indice: T_shape") + .Input("weight_input: T_id") + .Output("embedded: T") + .Attr("combiner: {'sqrtn', 'mean', 'sum'} = 'mean'") + .Attr("prune: bool = true") + .Attr("max_norm: float = -1.0") + .Attr("default_id: int = -1") + .Attr("partition_strategy: {'div','mod'} = 'div'") + .Attr("T_id: {int64, int32}") + .Attr("T_shape: {int64, int32}") + .Attr("T_weight: {float, resource}") + .Attr("T: {float} = DT_FLOAT") + .SetShapeFn([](InferenceContext* ctx) { + ShapeHandle temp; + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(1), 1, &temp)); + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(3), 2, &temp)); + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(2), 1, &temp)); + ShapeHandle emb_var_shape; + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(0), 2, &emb_var_shape)); + + DimensionHandle emb_vec_size_dim = ctx->Dim(emb_var_shape, 1); + DimensionHandle batch_dim = ctx->UnknownDim(); + + ShapeHandle output_shape = ctx->MakeShape({batch_dim, emb_vec_size_dim}); + ctx->set_output(0, output_shape); + + return OkStatus(); + }); + +REGISTER_OP("FusedSafeEmbeddingLookupSparseLocalGrad") + .Input("gradients: T") + .Input("input: Tinput") + .Input("indices: Tindices") + .Input("dense_shape: Tdense_shape") + .Output("output: T") + .Output("unique_value: Tinput") + .Attr("T: {float}") + .Attr("Tinput: {int64}") + .Attr("Tindices: {int64, int32}") + .Attr("Tdense_shape: {int64, int32}") + .Attr("combiner: {'sqrtn', 'mean', 'sum'} = 'mean'") + .SetShapeFn([](InferenceContext* ctx) { + ShapeHandle emb_var_shape; + 
TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(0), 2, &emb_var_shape)); + + DimensionHandle emb_vec_size_dim = ctx->Dim(emb_var_shape, 1); + DimensionHandle unique_dim = ctx->UnknownDim(); + + ShapeHandle output_shape = ctx->MakeShape({unique_dim, emb_vec_size_dim}); + ctx->set_output(0, output_shape); + + ShapeHandle unique_value_shape = ctx->MakeShape({unique_dim}); + ctx->set_output(1, unique_value_shape); + + return OkStatus(); + }); + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_post_grad_ops_test.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_post_grad_ops_test.cc new file mode 100644 index 00000000..acef2961 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_post_grad_ops_test.cc @@ -0,0 +1,243 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/conv_ops_gpu.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session.h" + +namespace tensorflow { +namespace { + +enum class Device { CPU, GPU }; + +class FusedEmbeddingSparsePostLookUpGradOpTest : public OpsTestBase { + protected: + void MakeOpAndSetDevice(Device device, int num_partitions, DataType dtype, + const std::string& combiner, const float max_norm, + const int default_id) { + if (device == Device::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + + TF_EXPECT_OK(NodeDefBuilder("fused_embedding__sparse_post_look_up_grad", + "FusedEmbeddingSparsePostLookUpGrad") + .Attr("T", dtype) + .Attr("num_partitions", num_partitions) + .Attr("partition_axis", 0) + .Attr("combiner", combiner) + .Attr("max_norm", max_norm) + .Attr("default_id", default_id) + .Input(FakeInput(dtype)) + .Input(FakeInput(dtype)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT32)) + .Input(FakeInput(DT_INT32)) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + } +}; + +TEST_F(FusedEmbeddingSparsePostLookUpGradOpTest, + Partition2_Mean_MaxNorm100_Float) { + const int nnz = 10; + const int batch_size = 4; + const int emb_vector_dim = 8; + const int entries = 8; + + MakeOpAndSetDevice(Device::GPU, 2, DT_FLOAT, "mean", 100.0, -1); + + // top_grad + AddInputFromArray( + TensorShape({batch_size, emb_vector_dim}), + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, + 22.0, 23.0, 
24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0}); + + // emb_shards + AddInputFromArray( + TensorShape({6, emb_vector_dim}), + {8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 24.0, 25.0, 26.0, 27.0, + 28.0, 29.0, 30.0, 31.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 32.0, 33.0, 34.0, 35.0, + 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0}); + AddInputFromArray( + TensorShape({4, emb_vector_dim}), + {56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, + 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, + 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, + 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0}); + + // sp_values: 3, 1, 4, 5, 7, 3, 12, 12, 15, 4 + // partitioned_values: 1, 3, 3, 4, 4, 5 and 7, 12, 12, 15 + // partitioned_indices + AddInputFromArray(TensorShape({6, 2}), + {0, 5, 0, 1, 2, 1, 1, 2, 3, 6, 1, 1}); + AddInputFromArray(TensorShape({4, 2}), {1, 7, 2, 4, 2, 7, 3, 0}); + + // feature_nums + AddInputFromArray(TensorShape({batch_size}), {2, 3, 3, 2}); + + // row_empty_and_invalid_flags + AddInputFromArray(TensorShape({batch_size + nnz}), + {0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor grad_shards_1(allocator(), DT_FLOAT, + TensorShape({6, emb_vector_dim})); + test::FillValues( + &grad_shards_1, + {0.00000000, 0.50000000, 1.00000000, 1.50000000, 2.00000000, + 2.50000000, 3.00000000, 3.50000000, 0.00000000, 0.50000000, + 1.00000000, 1.50000000, 2.00000000, 2.50000000, 3.00000000, + 3.50000000, 5.33333349, 5.66666651, 6.00000000, 6.33333349, + 6.66666651, 7.00000000, 7.33333349, 7.66666651, 2.65028572, + 2.98157120, 3.31285667, 3.64414287, 3.97542834, 4.30671406, + 4.63799953, 4.96928549, 11.92628479, 12.42321396, 12.92014217, + 13.41707039, 13.91399956, 14.41092777, 14.90785599, 15.40478516, + 2.16437674, 2.43492365, 2.70547056, 2.97601795, 3.24656487, + 3.51711202, 3.78765893, 4.05820608}); + test::ExpectTensorNear(grad_shards_1, *GetOutput(0), 1e-4); + } + + { + Tensor grad_shards_2(allocator(), DT_FLOAT, + TensorShape({4, emb_vector_dim})); + test::FillValues( + &grad_shards_2, + {1.58337951, 1.78130186, 1.97922409, 2.17714667, 2.37506914, 2.57299161, + 2.77091384, 2.96883631, 1.89459133, 2.01300311, 2.13141513, 2.24982715, + 2.36823893, 2.48665094, 2.60506320, 2.72347474, 1.89459133, 2.01300311, + 2.13141513, 2.24982715, 2.36823893, 2.48665094, 2.60506320, 2.72347474, + 3.43474555, 3.57786012, 3.72097445, 3.86408877, 4.00720310, 4.15031767, + 4.29343224, 4.43654633}); + test::ExpectTensorNear(grad_shards_2, *GetOutput(1), 1e-4); + } +} + +TEST_F(FusedEmbeddingSparsePostLookUpGradOpTest, + Partition2_SUM_Float_No_Default) { + const int nnz = 3; + const int batch_size = 3; + const int emb_vector_dim = 4; + const int entries = 8; + + MakeOpAndSetDevice(Device::GPU, 2, DT_FLOAT, "sum", -1.0, -1); + + // top_grad + AddInputFromArray( + TensorShape({batch_size, emb_vector_dim}), + {1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0}); + + // emb_shards + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0}); + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0}); + + // partitioned_indices + AddInputFromArray(TensorShape({2, 2}), {0, 0, 0, 5}); + AddInputFromArray(TensorShape({2, 2}), {1, 4, 2, 0}); + + // feature_nums + AddInputFromArray(TensorShape({batch_size}), {2, 1, 1}); + + // row_empty_and_invalid_flags + 
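+  // Layout note (not in the original test): the first batch_size entries are
+  // per-row emptiness flags and the remaining nnz entries are per-value
+  // validity flags, matching the single row_empty_and_invalid_flags output
+  // produced by FusedEmbeddingSparsePreLookUp.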
AddInputFromArray(TensorShape({batch_size + nnz}), {0, 0, 1, 1, 1, 1}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor grad_shards_1(allocator(), DT_FLOAT, + TensorShape({2, emb_vector_dim})); + test::FillValues(&grad_shards_1, + {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}); + test::ExpectTensorNear(grad_shards_1, *GetOutput(0), 1e-4); + } + + { + Tensor grad_shards_2(allocator(), DT_FLOAT, + TensorShape({2, emb_vector_dim})); + test::FillValues(&grad_shards_2, + {2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0}); + test::ExpectTensorNear(grad_shards_2, *GetOutput(1), 1e-4); + } +} + +TEST_F(FusedEmbeddingSparsePostLookUpGradOpTest, + Partition2_SUM_Float_Default_0) { + const int nnz = 3; + const int batch_size = 3; + const int emb_vector_dim = 4; + const int entries = 8; + + MakeOpAndSetDevice(Device::GPU, 2, DT_FLOAT, "sum", -1.0, 0); + + // top_grad + AddInputFromArray( + TensorShape({batch_size, emb_vector_dim}), + {1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0}); + + // emb_shards + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0}); + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0}); + + // partitioned_indices + AddInputFromArray(TensorShape({2, 2}), {0, 0, 0, 5}); + AddInputFromArray(TensorShape({2, 2}), {1, 4, 2, 0}); + + // feature_nums + AddInputFromArray(TensorShape({batch_size}), {2, 1, 1}); + + // row_empty_and_invalid_flags + AddInputFromArray(TensorShape({batch_size + nnz}), {0, 0, 1, 1, 1, 1}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor grad_shards_1(allocator(), DT_FLOAT, + TensorShape({2, emb_vector_dim})); + test::FillValues(&grad_shards_1, + {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}); + test::ExpectTensorNear(grad_shards_1, *GetOutput(0), 1e-4); + } + + { + Tensor grad_shards_2(allocator(), DT_FLOAT, + TensorShape({2, emb_vector_dim})); + test::FillValues(&grad_shards_2, + {2.0, 2.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0}); + test::ExpectTensorNear(grad_shards_2, *GetOutput(1), 1e-4); + } +} + +} // namespace +} // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_post_ops_gpus.cu.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_post_ops_gpus.cu.cc new file mode 100644 index 00000000..1e3bacc2 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_post_ops_gpus.cu.cc @@ -0,0 +1,328 @@ +#include +#include +#include + +#include "tensorflow/core/framework/op_kernel.h" + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "cub/thread/thread_operators.cuh" +#include "fused_embedding_common.cu.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { +using GPUDevice = Eigen::GpuDevice; + +namespace { +__global__ void SumUpEmbeddingShard(const float* emb_shard, + const int64_t* partitioned_indice, + float* emb_vectors, int* feature_nums, + const float max_norm, + const int emb_vec_size) { + __shared__ float l2_sum[1]; + + const int64_t row_in_batch = partitioned_indice[2 * blockIdx.x]; + float emb_element = emb_shard[blockIdx.x * emb_vec_size + threadIdx.x]; + if (max_norm >= 0.0f) { + if (threadIdx.x == 0) { + l2_sum[0] = 0.0f; + } + __syncthreads(); + atomicAdd(l2_sum, emb_element * emb_element); + __syncthreads(); + float l2_norm = sqrtf(l2_sum[0]); + if (l2_norm > max_norm) { + emb_element *= max_norm / l2_norm; + } + } + + 
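+  // Accumulate this (possibly norm-clipped) element into the output row for
+  // its batch entry; feature_nums counts how many shard rows contributed so
+  // that ApplyCombiner can normalize afterwards.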
atomicAdd(emb_vectors + row_in_batch * emb_vec_size + threadIdx.x, + emb_element); + + if (threadIdx.x == 0) { + atomicAdd(feature_nums + row_in_batch, 1); + } +} + +template +__global__ void ApplyCombiner(float* emb_vectors, const int* row_emptiness_flag, + const bool set_empty_row_zero, + const int* feature_nums) { + const int offset = blockIdx.x * blockDim.x + threadIdx.x; + if (set_empty_row_zero) { + if (row_emptiness_flag[blockIdx.x]) { + emb_vectors[offset] = 0.0f; + return; + } + } + const int feature_num = feature_nums[blockIdx.x]; + const float emb_element = emb_vectors[offset]; + emb_vectors[offset] = Combine(emb_element, feature_num); +} + +template +__global__ void DistributeGradToShard( + const float* top_grad, const float* emb_shard, + const int64_t* partitioned_indice, const int* feature_nums, + const int* row_emptiness_flag, const bool set_empty_row_zero, + float* grad_shard, const int64_t sub_nnz, const int64_t emb_vec_size, + const float max_norm) { + __shared__ int64_t row_in_batch_shared[1]; + __shared__ int feature_num_shared[1]; + __shared__ float l2_sum[1]; + int64_t row_in_batch; + if (threadIdx.x == 0) { + row_in_batch = partitioned_indice[2 * blockIdx.x]; + row_in_batch_shared[0] = row_in_batch; + feature_num_shared[0] = feature_nums[row_in_batch]; + } + __syncthreads(); + row_in_batch = row_in_batch_shared[0]; + const int feature_num = feature_num_shared[0]; + if (set_empty_row_zero) { + if (row_emptiness_flag[row_in_batch]) { + grad_shard[blockIdx.x * emb_vec_size + threadIdx.x] = 0.0f; + return; + } + } + float grad = top_grad[row_in_batch * emb_vec_size + threadIdx.x]; + grad = CombineGrad(grad, feature_num); + if (max_norm >= 0.0f) { + const float emb_element = + emb_shard[blockIdx.x * emb_vec_size + threadIdx.x]; + if (threadIdx.x == 0) { + l2_sum[0] = 0.0f; + } + __syncthreads(); + atomicAdd(l2_sum, emb_element * emb_element); + __syncthreads(); + float l2_norm = sqrtf(l2_sum[0]); + if (l2_norm > max_norm) { + grad *= max_norm / l2_norm; + } + } + grad_shard[blockIdx.x * emb_vec_size + threadIdx.x] = grad; +} +} // namespace + +class FusedEmbeddingSparsePostLookUpGPU : public OpKernel { + public: + explicit FusedEmbeddingSparsePostLookUpGPU(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("num_partitions", &num_partitions_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("partition_axis", &partition_axis_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("combiner", &combiner_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_norm", &max_norm_)); + int temp_default_id; + OP_REQUIRES_OK(ctx, ctx->GetAttr("default_id", &temp_default_id)); + default_id_ = int64_t(temp_default_id); + } + + void Compute(OpKernelContext* ctx) override { + auto stream = ctx->eigen_device().stream(); + + OpInputList emb_shards; + OP_REQUIRES_OK(ctx, ctx->input_list("emb_shards", &emb_shards)); + + OpInputList partitioned_indices; + OP_REQUIRES_OK( + ctx, ctx->input_list("partitioned_indices", &partitioned_indices)); + + Tensor const* dense_shape_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("sp_dense_shape", &dense_shape_tensor)); + + Tensor const* row_empty_and_invalid_flags = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("row_empty_and_invalid_flags", + &row_empty_and_invalid_flags)); + + const int64_t emb_vec_size = emb_shards[0].shape().dim_size(1); + const int64_t batch_size = dense_shape_tensor->flat().data()[0]; + + // 1. 
sum up emb values from different entries and dump into output + Tensor* emb_vectors_tensor = nullptr; + OP_REQUIRES_OK( + ctx, ctx->allocate_output(0, TensorShape({batch_size, emb_vec_size}), + &emb_vectors_tensor)); + // stream_executor::DeviceMemoryBase emb_vectors_wrapper( + // emb_vectors_tensor.flat().data(), + // emb_vectors_tensor->NumElements() * sizeof(float)); + // stream->ThenMemZero(&emb_vectors_wrapper, + // emb_vectors_tensor->NumElements() * sizeof(float)); + + cudaMemsetAsync(emb_vectors_tensor->flat().data(), 0x0, + sizeof(float) * emb_vectors_tensor->NumElements(), stream); + + Tensor* feature_nums; + OP_REQUIRES_OK( + ctx, ctx->allocate_output(1, TensorShape({batch_size}), &feature_nums)); + // stream_executor::DeviceMemoryBase feature_nums_wrapper( + // feature_nums.flat().data(), + // feature_nums.NumElements() * sizeof(int)); + // stream->ThenMemZero(&feature_nums_wrapper, + // feature_nums.NumElements() * sizeof(int)); + cudaMemsetAsync(feature_nums->flat().data(), 0x0, + sizeof(int) * feature_nums->NumElements(), stream); + + for (int i = 0; i < num_partitions_; i++) { + const size_t sub_nnz = emb_shards[i].shape().dim_size(0); + OP_REQUIRES( + ctx, sub_nnz == partitioned_indices[i].shape().dim_size(0), + errors::InvalidArgument( + "emb_shard and partitioned_indice dosn't have the same length")); + + { + const int blocks = sub_nnz; + const int threads = emb_vec_size; + SumUpEmbeddingShard<<>>( + emb_shards[i].flat().data(), + reinterpret_cast( + partitioned_indices[i].flat().data()), + emb_vectors_tensor->flat().data(), + feature_nums->flat().data(), max_norm_, emb_vec_size); + CK_CUDA_THROW_(cudaGetLastError()); + } + } + + const bool set_empty_row_zero = default_id_ >= 0; + // 2. combiner + { + const int blocks = batch_size; + const int threads = emb_vec_size; + if (combiner_ == "sqrtn") { + ApplyCombiner<<>>( + emb_vectors_tensor->flat().data(), + row_empty_and_invalid_flags->flat().data(), set_empty_row_zero, + feature_nums->flat().data()); + } else if (combiner_ == "mean") { + ApplyCombiner<<>>( + emb_vectors_tensor->flat().data(), + row_empty_and_invalid_flags->flat().data(), set_empty_row_zero, + feature_nums->flat().data()); + } else { + ApplyCombiner<<>>( + emb_vectors_tensor->flat().data(), + row_empty_and_invalid_flags->flat().data(), set_empty_row_zero, + feature_nums->flat().data()); + } + CK_CUDA_THROW_(cudaGetLastError()); + } + } + + private: + int num_partitions_; + int partition_axis_; + std::string combiner_; + float max_norm_; + int64_t default_id_; +}; + +REGISTER_KERNEL_BUILDER(Name("FusedEmbeddingSparsePostLookUp") + .Device(DEVICE_GPU) + .HostMemory("sp_dense_shape"), + FusedEmbeddingSparsePostLookUpGPU); + +class FusedEmbeddingSparsePostLookUpGradGPU : public OpKernel { + public: + explicit FusedEmbeddingSparsePostLookUpGradGPU(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("num_partitions", &num_partitions_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("partition_axis", &partition_axis_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("combiner", &combiner_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("max_norm", &max_norm_)); + int temp_default_id; + OP_REQUIRES_OK(ctx, ctx->GetAttr("default_id", &temp_default_id)); + default_id_ = int64_t(temp_default_id); + } + + void Compute(OpKernelContext* ctx) override { + auto stream = ctx->eigen_device().stream(); + + Tensor const* top_grad_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("top_grad", &top_grad_tensor)); + + OpInputList emb_shards; + OP_REQUIRES_OK(ctx, 
ctx->input_list("emb_shards", &emb_shards)); + + OpInputList partitioned_indices; + OP_REQUIRES_OK( + ctx, ctx->input_list("partitioned_indices", &partitioned_indices)); + + Tensor const* feature_nums = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("feature_nums", &feature_nums)); + + Tensor const* row_empty_and_invalid_flags = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("row_empty_and_invalid_flags", + &row_empty_and_invalid_flags)); + + OpOutputList grad_shards; + OP_REQUIRES_OK(ctx, ctx->output_list("grad_shards", &grad_shards)); + + const int64_t batch_size = top_grad_tensor->shape().dim_size(0); + const int64_t emb_vec_size = emb_shards[0].shape().dim_size(1); + + const bool set_empty_row_zero = default_id_ >= 0; + + for (int i = 0; i < num_partitions_; i++) { + const int64_t sub_nnz = partitioned_indices[i].shape().dim_size(0); + + Tensor* grad_shard; + OP_REQUIRES_OK( + ctx, grad_shards.allocate(i, TensorShape({sub_nnz, emb_vec_size}), + &grad_shard)); + + { + const int blocks = sub_nnz; + const int threads = emb_vec_size; + if (combiner_ == "sqrtn") { + DistributeGradToShard<<>>( + top_grad_tensor->flat().data(), + emb_shards[i].flat().data(), + reinterpret_cast( + partitioned_indices[i].flat().data()), + feature_nums->flat().data(), + row_empty_and_invalid_flags->flat().data(), + set_empty_row_zero, grad_shard->flat().data(), sub_nnz, + emb_vec_size, max_norm_); + } else if (combiner_ == "mean") { + DistributeGradToShard<<>>( + top_grad_tensor->flat().data(), + emb_shards[i].flat().data(), + reinterpret_cast( + partitioned_indices[i].flat().data()), + feature_nums->flat().data(), + row_empty_and_invalid_flags->flat().data(), + set_empty_row_zero, grad_shard->flat().data(), sub_nnz, + emb_vec_size, max_norm_); + } else { + DistributeGradToShard<<>>( + top_grad_tensor->flat().data(), + emb_shards[i].flat().data(), + reinterpret_cast( + partitioned_indices[i].flat().data()), + feature_nums->flat().data(), + row_empty_and_invalid_flags->flat().data(), + set_empty_row_zero, grad_shard->flat().data(), sub_nnz, + emb_vec_size, max_norm_); + } + CK_CUDA_THROW_(cudaGetLastError()); + } + } + } + + private: + int num_partitions_; + int partition_axis_; + std::string combiner_; + float max_norm_; + int64_t default_id_; +}; + +REGISTER_KERNEL_BUILDER( + Name("FusedEmbeddingSparsePostLookUpGrad").Device(DEVICE_GPU), + FusedEmbeddingSparsePostLookUpGradGPU); + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_post_ops_test.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_post_ops_test.cc new file mode 100644 index 00000000..3321f3ff --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_post_ops_test.cc @@ -0,0 +1,213 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/conv_ops_gpu.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session.h" + +namespace tensorflow { +namespace { + +enum class Device { CPU, GPU }; +class FusedEmbeddingSparsePostLookUpOpTest : public OpsTestBase { + protected: + void MakeOpAndSetDevice(Device device, int num_partitions, DataType dtype, + const std::string& combiner, const float max_norm, + const int default_id) { + if (device == Device::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + + TF_EXPECT_OK(NodeDefBuilder("fused_embedding_sparse_post_look_up", + "FusedEmbeddingSparsePostLookUp") + .Attr("T", dtype) + .Attr("num_partitions", num_partitions) + .Attr("partition_axis", 0) + .Attr("combiner", combiner) + .Attr("max_norm", max_norm) + .Attr("default_id", default_id) + .Input(FakeInput(num_partitions, dtype)) + .Input(FakeInput(num_partitions, DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT32)) + .Input(FakeInput(DT_INT64)) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + } +}; + +TEST_F(FusedEmbeddingSparsePostLookUpOpTest, + Partition3_Sqrtn_MaxNorm200_Float) { + const int nnz = 10; + const int batch_size = 4; + const int emb_vector_dim = 8; + const int entries = 8; + + MakeOpAndSetDevice(Device::GPU, 3, DT_FLOAT, "sqrtn", 200.0, -1); + + // emb_shards + AddInputFromArray( + TensorShape({6, emb_vector_dim}), + { + 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 24.0, 25.0, + 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 24.0, 25.0, 26.0, 27.0, + 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, + 38.0, 39.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, + 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, + }); + AddInputFromArray(TensorShape({1, emb_vector_dim}), + {56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0}); + AddInputFromArray( + TensorShape({3, emb_vector_dim}), + {96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, + 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, + 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0}); + + // partitioned_indices + AddInputFromArray(TensorShape({6, 2}), + {0, 5, 0, 1, 2, 1, 1, 2, 3, 6, 1, 1}); + AddInputFromArray(TensorShape({1, 2}), {1, 7}); + AddInputFromArray(TensorShape({3, 2}), {2, 4, 2, 7, 3, 0}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {batch_size, entries}); + + // row_empty_and_invalid_flags + AddInputFromArray(TensorShape({batch_size + nnz}), + {0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_emb_vectors(allocator(), DT_FLOAT, + TensorShape({batch_size, emb_vector_dim})); + test::FillValues( + &expected_emb_vectors, + {22.62741661, 24.04163170, 25.45584488, 26.87005806, 28.28427124, + 29.69848442, 31.11269951, 32.52691269, 73.90083313, 75.63288879, + 77.36493683, 79.09698486, 80.82904053, 82.56108856, 84.29314423, + 86.02519226, 92.61308289, 94.01081848, 95.40855408, 96.80628204, + 98.20401764, 
99.60175323, 100.99948120, 102.39721680, 71.20205688, + 72.31395721, 73.42584991, 74.53774261, 75.64963531, 76.76153564, + 77.87342834, 78.98532867}); + test::ExpectTensorNear(expected_emb_vectors, *GetOutput(0), 1e-4); + } + { + Tensor feature_nums_expected(allocator(), DT_INT32, + TensorShape({batch_size})); + test::FillValues(&feature_nums_expected, {2, 3, 3, 2}); + test::ExpectTensorEqual(feature_nums_expected, *GetOutput(1)); + } +} + +TEST_F(FusedEmbeddingSparsePostLookUpOpTest, Partition2_Sum_No_Default) { + const int nnz = 3; + const int batch_size = 3; + const int emb_vector_dim = 4; + const int entries = 8; + + MakeOpAndSetDevice(Device::GPU, 2, DT_FLOAT, "sum", -1.0, -1); + + // emb_shards + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0}); + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {10.0, 10.0, 10.0, 10.0, 13.0, 13.0, 13.0, 13.0}); + + // partitioned_indices + AddInputFromArray(TensorShape({2, 2}), {0, 0, 0, 5}); + AddInputFromArray(TensorShape({2, 2}), {1, 4, 2, 0}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {batch_size, entries}); + + // row_empty_and_invalid_flags + AddInputFromArray(TensorShape({batch_size + nnz}), {0, 0, 1, 1, 1, 1}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_emb_vectors(allocator(), DT_FLOAT, + TensorShape({batch_size, emb_vector_dim})); + test::FillValues( + &expected_emb_vectors, + {3.0, 3.0, 3.0, 3.0, 10.0, 10.0, 10.0, 10.0, 13.0, 13.0, 13.0, 13.0}); + test::ExpectTensorNear(expected_emb_vectors, *GetOutput(0), 1e-4); + } + { + Tensor feature_nums_expected(allocator(), DT_INT32, + TensorShape({batch_size})); + test::FillValues(&feature_nums_expected, {2, 1, 1}); + test::ExpectTensorEqual(feature_nums_expected, *GetOutput(1)); + } +} + +TEST_F(FusedEmbeddingSparsePostLookUpOpTest, Partition2_Sum_Default_0) { + const int nnz = 3; + const int batch_size = 3; + const int emb_vector_dim = 4; + const int entries = 8; + + MakeOpAndSetDevice(Device::GPU, 2, DT_FLOAT, "sum", -1.0, 0); + + // emb_shards + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0}); + AddInputFromArray(TensorShape({2, emb_vector_dim}), + {10.0, 10.0, 10.0, 10.0, 13.0, 13.0, 13.0, 13.0}); + + // partitioned_indices + AddInputFromArray(TensorShape({2, 2}), {0, 0, 0, 5}); + AddInputFromArray(TensorShape({2, 2}), {1, 4, 2, 0}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {batch_size, entries}); + + // row_empty_and_invalid_flags + AddInputFromArray(TensorShape({batch_size + nnz}), {0, 0, 1, 1, 1, 1}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_emb_vectors(allocator(), DT_FLOAT, + TensorShape({batch_size, emb_vector_dim})); + test::FillValues( + &expected_emb_vectors, + {3.0, 3.0, 3.0, 3.0, 10.0, 10.0, 10.0, 10.0, 0.0, 0.0, 0.0, 0.0}); + test::ExpectTensorNear(expected_emb_vectors, *GetOutput(0), 1e-4); + } + { + Tensor feature_nums_expected(allocator(), DT_INT32, + TensorShape({batch_size})); + test::FillValues(&feature_nums_expected, {2, 1, 1}); + test::ExpectTensorEqual(feature_nums_expected, *GetOutput(1)); + } +} + +} // namespace +} // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_pre_ops_gpus.cu.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_pre_ops_gpus.cu.cc new file mode 100644 index 00000000..9e2f2378 --- /dev/null +++ 
b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_pre_ops_gpus.cu.cc @@ -0,0 +1,521 @@ +#include +#include +#include + +#include "tensorflow/core/framework/op_kernel.h" + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "cub/device/device_radix_sort.cuh" +#include "cub/device/device_select.cuh" +#include "cub/iterator/constant_input_iterator.cuh" +#include "cub/thread/thread_operators.cuh" +#include "fused_embedding_common.cu.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { +using GPUDevice = Eigen::GpuDevice; + +namespace { + +__global__ void InitFlagsToOneInt4(int length, int* flags) { + int offset = blockIdx.x * blockDim.x + threadIdx.x; + if (4 * offset + 3 < length) { + *((::int4*)(flags + 4 * offset)) = make_int4(1, 1, 1, 1); + } else if (4 * offset < length) { + for (int i = 0; i < length - 4 * offset; i++) { + flags[4 * offset + i] = 1; + } + } +} + +__global__ void FusedMultiFunctionalKernel( + const IndicePair* indices, const int64_t* values, const int64_t nnz, + const int64_t batch_size, const bool prune_invalid_id, + const int64_t default_id, int* row_emptiness_flag, int* invalid_id_flag, + IndicePair* tmp_indices_buffer, int64_t* values_extended) { + // This kernel will do many things together + // 1. The first part of threads will do job 1(DetectRowEmptiness), others will + // do job2(InitBatchRowsBuffer) + // 2. Do job3 (set values extended to default id) + + const int offset = blockIdx.x * blockDim.x + threadIdx.x; + if (offset < nnz) { + // do DetectRowEmptiness + if (prune_invalid_id) { + const int64_t value = values[offset]; + if (value < 0) { + // invalid, set invalid_id_flag + atomicAnd(invalid_id_flag + offset, 0); + } else { + // valid, set row_emptiness_flag + const int64_t row_in_batch = indices[offset].row_in_batch; + atomicAnd(row_emptiness_flag + row_in_batch, 0); + } + } else { + // set row_emptiness_flag + const int64_t row_in_batch = indices[offset].row_in_batch; + atomicAnd(row_emptiness_flag + row_in_batch, 0); + } + } else { + // do InitBatchRowsBuffer + const int other_offset = offset - nnz; + if (other_offset < batch_size) { + tmp_indices_buffer[other_offset].row_in_batch = other_offset; + // always set entry id to 0; + tmp_indices_buffer[other_offset].entry_in_column = 0; + } + } + + // set values extended to default id + if (2 * offset + 1 < nnz + batch_size) { + longlong2 l2 = make_longlong2(default_id, default_id); + *((longlong2*)(values_extended + 2 * offset)) = l2; + } else if (2 * offset < nnz + batch_size) { + values_extended[2 * offset] = default_id; + } +} + +__global__ void DetectInvalid(const int64_t* values, const int64_t nnz, + int* invalid_id_flag) { + const int offset = blockIdx.x * blockDim.x + threadIdx.x; + if (offset < nnz) { + const int64_t value = values[offset]; + if (value < 0) { + atomicAnd(invalid_id_flag + offset, 0); + } + } +} + +__global__ void CalcElementsOffsetPerPartition( + const int64_t* values_sorted, int64_t* partition_sizes_accumulate, + int64_t* elements_offset_per_partition, int nnz) { + // dichotomy + const int64_t target = partition_sizes_accumulate[blockIdx.x]; + int roof = nnz; + int floor = 0; + + int pos = (roof + floor) / 2; + while (1) { + if (pos == 0) { + pos = -1; + break; + } else if (pos == nnz - 1) { + break; + } + int64_t value = values_sorted[pos]; + int64_t value_plus_1 = values_sorted[pos + 1]; + if (value < target && value_plus_1 >= target) { + break; + } + if (value < target) { + floor = pos; + } else { + roof = pos; + } + pos = 
(roof + floor) / 2; + } + elements_offset_per_partition[blockIdx.x] = int64_t(pos + 1); +} + +__global__ void GatherAndConvertToSubPartition( + const int64_t* sub_values_sorted, int64_t* sub_partitioned_values, + const int64_t partition_start_base, const int64_t partition_size) { + const int t_offset = blockIdx.x * blockDim.x + threadIdx.x; + if (t_offset < partition_size) { + int64_t value = sub_values_sorted[t_offset]; + // rebase value to it's corresponding sub partition + value = value - partition_start_base; + sub_partitioned_values[t_offset] = value; + } +} + +} // namespace + +class FusedEmbeddingSparsePreLookUpGPU : public OpKernel { + public: + explicit FusedEmbeddingSparsePreLookUpGPU(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("num_partitions", &num_partitions_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("partition_axis", &partition_axis_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("fill_empty_row", &fill_empty_row_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("prune_invalid_id", &prune_invalid_id_)); + int temp_default_id; + OP_REQUIRES_OK(ctx, ctx->GetAttr("default_id", &temp_default_id)); + default_id_ = int64_t(temp_default_id); + } + + void Compute(OpKernelContext* ctx) override { + auto stream = ctx->eigen_device().stream(); + + const int64_t default_id = default_id_ >= 0 ? default_id_ : 0; + const int linear_mapping_threads = 128; + + // 1. bind inputs + Tensor const* values_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("sp_values", &values_tensor)); + const int64_t nnz = values_tensor->shape().dim_size(0); + + Tensor const* indices_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("sp_indices", &indices_tensor)); + + Tensor const* dense_shape = nullptr; + OP_REQUIRES_OK(ctx, ctx->input("sp_dense_shape", &dense_shape)); + const int64_t batch_size = dense_shape->flat().data()[0]; + + OpInputList partition_shapes; + OP_REQUIRES_OK(ctx, ctx->input_list("partition_shapes", &partition_shapes)); + + partition_sizes_accumulate_.clear(); + for (const Tensor& shape : partition_shapes) { + OP_REQUIRES(ctx, shape.dims() <= 2, + errors::InvalidArgument( + "input partition_shapes must all less than rank 2")); + const int64_t accu = partition_sizes_accumulate_.empty() + ? shape.flat().data()[0] + : shape.flat().data()[0] + + partition_sizes_accumulate_.back(); + partition_sizes_accumulate_.push_back(accu); + } + + // 2. allocate cub tmp storage + Tensor cub_temp_storage; + size_t max_cub_bytes = 0; + size_t temp_storage_bytes = 0; + + if (num_partitions_ > 1) { + cub::DeviceRadixSort::SortPairs( + (void*)nullptr, temp_storage_bytes, (int64_t*)nullptr, + (int64_t*)nullptr, (IndicePair*)nullptr, (IndicePair*)nullptr, + int(nnz + batch_size), 0, sizeof(int64_t) * 8, stream); + max_cub_bytes = temp_storage_bytes > max_cub_bytes ? temp_storage_bytes + : max_cub_bytes; + } + + if (fill_empty_row_ || prune_invalid_id_) { + cub::DeviceSelect::Flagged(nullptr, temp_storage_bytes, (int64_t*)nullptr, + (int*)nullptr, (int64_t*)nullptr, + (int*)nullptr, nnz, stream); + + max_cub_bytes = temp_storage_bytes > max_cub_bytes ? temp_storage_bytes + : max_cub_bytes; + + cub::DeviceSelect::Flagged( + (void*)nullptr, temp_storage_bytes, (IndicePair*)nullptr, + (int*)nullptr, (IndicePair*)nullptr, (int*)nullptr, nnz, stream); + + max_cub_bytes = temp_storage_bytes > max_cub_bytes ? 
temp_storage_bytes + : max_cub_bytes; + + if (fill_empty_row_) { + cub::DeviceSelect::Flagged((void*)nullptr, temp_storage_bytes, + (IndicePair*)nullptr, (int*)nullptr, + (IndicePair*)nullptr, (int*)nullptr, + batch_size, stream); + max_cub_bytes = temp_storage_bytes > max_cub_bytes ? temp_storage_bytes + : max_cub_bytes; + } + } + + OP_REQUIRES_OK( + ctx, ctx->allocate_temp( + DT_INT8, TensorShape({static_cast(max_cub_bytes)}), + &cub_temp_storage)); + + // 3. fill_empty_row, prune, if avaliable. + Tensor values_extended; + Tensor indices_extended; + Tensor tmp_indices_buffer; + Tensor* all_flags; + Tensor selected_num_d; + int new_nnz = nnz; + + OP_REQUIRES_OK( + ctx, ctx->allocate_output(2 * num_partitions_, + TensorShape{batch_size + nnz}, &all_flags)); + + if (fill_empty_row_ || prune_invalid_id_) { + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DT_INT64, TensorShape{nnz + batch_size}, + &values_extended)); + OP_REQUIRES_OK( + ctx, ctx->allocate_temp(DT_INT64, TensorShape{2 * (nnz + batch_size)}, + &indices_extended)); + OP_REQUIRES_OK(ctx, + ctx->allocate_temp(DT_INT64, TensorShape{2 * batch_size}, + &tmp_indices_buffer)); + OP_REQUIRES_OK( + ctx, ctx->allocate_temp(DT_INT32, TensorShape{1}, &selected_num_d)); + + { + const int threads = linear_mapping_threads; + const int blocks = + CalcBlocksLinearMapping(batch_size + nnz, threads * 4); + InitFlagsToOneInt4<<>>( + batch_size + nnz, all_flags->flat().data()); + CK_CUDA_THROW_(cudaGetLastError()); + } + + // 3.1 set flags, init tmp_indices_buffer etc. + if (fill_empty_row_) { + { + const int threads = linear_mapping_threads; + const int blocks = CalcBlocksLinearMapping(nnz + batch_size, threads); + FusedMultiFunctionalKernel<<>>( + reinterpret_cast( + indices_tensor->flat().data()), + reinterpret_cast( + values_tensor->flat().data()), + nnz, batch_size, prune_invalid_id_, default_id, + all_flags->flat().data(), + all_flags->flat().data() + batch_size, + reinterpret_cast( + tmp_indices_buffer.flat().data()), + reinterpret_cast(values_extended.flat().data())); + CK_CUDA_THROW_(cudaGetLastError()); + } + } else if (prune_invalid_id_) { + { + const int threads = linear_mapping_threads; + const int blocks = CalcBlocksLinearMapping(nnz, threads); + DetectInvalid<<>>( + reinterpret_cast( + values_tensor->flat().data()), + nnz, all_flags->flat().data() + batch_size); + CK_CUDA_THROW_(cudaGetLastError()); + } + } + // 3.2 select copy valid id, select copy empty row indices + + cudaError_t cuda_ret = cudaSuccess; + cuda_ret = cub::DeviceSelect::Flagged( + cub_temp_storage.flat().data(), max_cub_bytes, + reinterpret_cast(values_tensor->flat().data()), + (const int*)(all_flags->flat().data() + batch_size), + reinterpret_cast(values_extended.flat().data()), + selected_num_d.flat().data(), int(nnz), stream); + CK_CUDA_THROW_(cudaGetLastError()); + + cub::DeviceSelect::Flagged( + cub_temp_storage.flat().data(), max_cub_bytes, + reinterpret_cast( + indices_tensor->flat().data()), + all_flags->flat().data() + batch_size, + reinterpret_cast(indices_extended.flat().data()), + selected_num_d.flat().data(), nnz, stream); + + if (prune_invalid_id_) { + int selected_num; + cudaMemcpyAsync(&selected_num, selected_num_d.flat().data(), + sizeof(int), cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + new_nnz = selected_num; + } + + if (fill_empty_row_) { + cub::DeviceSelect::Flagged( + cub_temp_storage.flat().data(), max_cub_bytes, + reinterpret_cast( + tmp_indices_buffer.flat().data()), + all_flags->flat().data(), + reinterpret_cast( + 
indices_extended.flat().data()) + + new_nnz, + selected_num_d.flat().data(), batch_size, stream); + CK_CUDA_THROW_(cudaGetLastError()); + int selected_num; + cudaMemcpyAsync(&selected_num, selected_num_d.flat().data(), + sizeof(int), cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + new_nnz += selected_num; + } + } + + // 3.5 set the correct pointer + const int64_t* values_in = (fill_empty_row_ || prune_invalid_id_) + ? reinterpret_cast( + values_extended.flat().data()) + : reinterpret_cast( + values_tensor->flat().data()); + const IndicePair* indices_in = + (fill_empty_row_ || prune_invalid_id_) + ? reinterpret_cast( + indices_extended.flat().data()) + : reinterpret_cast( + indices_tensor->flat().data()); + + OpOutputList partitioned_values; + OP_REQUIRES_OK(ctx, + ctx->output_list("partitioned_values", &partitioned_values)); + OpOutputList partitioned_indices; + OP_REQUIRES_OK( + ctx, ctx->output_list("partitioned_indices", &partitioned_indices)); + + // 4. set output + if (num_partitions_ == 1) { + // single partition case, just directly copy + Tensor* pv_out; + OP_REQUIRES_OK( + ctx, partitioned_values.allocate( + 0, TensorShape({static_cast(new_nnz)}), &pv_out)); + Tensor* pi_out; + OP_REQUIRES_OK( + ctx, + partitioned_indices.allocate( + 0, TensorShape({static_cast(new_nnz), 2}), &pi_out)); + + cudaMemcpyAsync(pv_out->flat().data(), values_in, + sizeof(int64_t) * new_nnz, cudaMemcpyDeviceToDevice, + stream); + cudaMemcpyAsync(pi_out->flat().data(), indices_in, + sizeof(IndicePair) * new_nnz, cudaMemcpyDeviceToDevice, + stream); + + } else { + // multi-partitions case, calcaulate indices and split them. + Tensor values_sorted; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_INT64, TensorShape{new_nnz}, + &values_sorted)); + Tensor indices_sorted; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_INT64, TensorShape{new_nnz, 2}, + &indices_sorted)); + + cub::DeviceRadixSort::SortPairs( + cub_temp_storage.flat().data(), max_cub_bytes, values_in, + reinterpret_cast(values_sorted.flat().data()), + indices_in, + reinterpret_cast(indices_sorted.flat().data()), + int(new_nnz), 0, sizeof(int64_t) * 8, stream); + CK_CUDA_THROW_(cudaGetLastError()); + + // 4.1 calculate how many elements for each + // partition + Tensor partition_sizes_accumulate; + OP_REQUIRES_OK( + ctx, + ctx->allocate_temp( + DT_INT64, TensorShape({static_cast(num_partitions_)}), + &partition_sizes_accumulate)); + cudaMemcpyAsync(partition_sizes_accumulate.flat().data(), + partition_sizes_accumulate_.data(), + num_partitions_ * sizeof(int64_t), cudaMemcpyHostToDevice, + stream); + + Tensor elements_offset_per_partition; + OP_REQUIRES_OK( + ctx, + ctx->allocate_temp( + DT_INT64, TensorShape({static_cast(num_partitions_)}), + &elements_offset_per_partition)); + + { + const int blocks = num_partitions_; + const int threads = 1; + CalcElementsOffsetPerPartition<<>>( + reinterpret_cast( + values_sorted.flat().data()), + reinterpret_cast( + partition_sizes_accumulate.flat().data()), + reinterpret_cast( + elements_offset_per_partition.flat().data()), + int(new_nnz)); + CK_CUDA_THROW_(cudaGetLastError()); + } + + elements_offset_per_partition_.clear(); + elements_offset_per_partition_.resize(num_partitions_); + // stream_executor::DeviceMemoryBase + // elements_offset_per_partition_wrapped( + // elements_offset_per_partition.flat().data(), + // num_partitions_); + // stream->ThenMemcpy(elements_offset_per_partition_.data(), + // elements_offset_per_partition_wrapped, + // num_partitions_ * + // sizeof(int64_t)); + // 
stream->BlockHostUntilDone(); + + cudaMemcpyAsync(elements_offset_per_partition_.data(), + elements_offset_per_partition.flat().data(), + num_partitions_ * sizeof(int64_t), cudaMemcpyDeviceToHost, + stream); + cudaStreamSynchronize(stream); + + // 4.2 set output + int64_t sub_start_offset = 0; + for (int i = 0; i < num_partitions_; i++) { + int64_t size = elements_offset_per_partition_[i] - sub_start_offset; + + Tensor* sub_partitioned_values; + OP_REQUIRES_OK(ctx, partitioned_values.allocate( + i, TensorShape({static_cast(size)}), + &sub_partitioned_values)); + + Tensor* sub_partitioned_indices; + OP_REQUIRES_OK(ctx, partitioned_indices.allocate( + i, TensorShape({static_cast(size), 2}), + &sub_partitioned_indices)); + + if (size > 0) { + // some partition does not have any + // element that falls in it + const int threads = linear_mapping_threads; + int blocks = CalcBlocksLinearMapping(size, threads); + + const int partition_start_base = + i == 0 ? 0 : partition_sizes_accumulate_[i - 1]; + GatherAndConvertToSubPartition<<>>( + reinterpret_cast( + values_sorted.flat().data()) + + sub_start_offset, + reinterpret_cast( + sub_partitioned_values->flat().data()), + partition_start_base, size); + + CK_CUDA_THROW_(cudaGetLastError()); + + // stream_executor::DeviceMemoryBase + // sub_indices_sorted_wrapped( + // reinterpret_cast(indices_sorted.flat().data()) + // + + // partition_start_base, + // size * sizeof(IndicePair)); + // stream_executor::DeviceMemoryBase + // sub_indices_out_wrapped( + // reinterpret_cast( + // sub_partitioned_indices.flat().data()), + // size * sizeof(IndicePair)); + // stream->ThenMemcpy(&sub_indices_out_wrapped, + // sub_indices_sorted_wrapped, + // size * 2 * + // sizeof(int64_t)); + cudaMemcpyAsync( + sub_partitioned_indices->flat().data(), + indices_sorted.flat().data() + 2 * sub_start_offset, + size * 2 * sizeof(int64_t), cudaMemcpyDeviceToDevice, stream); + } + sub_start_offset = elements_offset_per_partition_[i]; + } + } + // Op kernel execution done + } + + private: + int num_partitions_; + int partition_axis_; + bool fill_empty_row_; + bool prune_invalid_id_; + int64_t default_id_; + std::vector partition_sizes_accumulate_; + std::vector elements_offset_per_partition_; +}; + +REGISTER_KERNEL_BUILDER(Name("FusedEmbeddingSparsePreLookUp") + .Device(DEVICE_GPU) + .HostMemory("partition_shapes") + .HostMemory("sp_dense_shape"), + FusedEmbeddingSparsePreLookUpGPU); +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_pre_ops_test.cc b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_pre_ops_test.cc new file mode 100644 index 00000000..e9603304 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_pre_ops_test.cc @@ -0,0 +1,352 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session.h" + +namespace tensorflow { +namespace { + +enum class Device { CPU, GPU }; + +class FusedEmbeddingSparsePreLookUpOpTest : public OpsTestBase { + protected: + void MakeOpAndSetDevice(Device device, const int num_partitions, + const bool fill_empty_row, + const bool prune_invalid_id, const int default_id) { + if (device == Device::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + + TF_EXPECT_OK(NodeDefBuilder("fused_embedding_sparse_pre_look_up", + "FusedEmbeddingSparsePreLookUp") + .Attr("num_partitions", num_partitions) + .Attr("partition_axis", 0) + .Attr("fill_empty_row", fill_empty_row) + .Attr("prune_invalid_id", prune_invalid_id) + .Attr("default_id", default_id) + .Input(FakeInput(num_partitions, DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Input(FakeInput(DT_INT64)) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + } +}; + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, Partition3_Int64) { + MakeOpAndSetDevice(Device::GPU, 3, false, false, -1); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {6, 16}); + // partition_shapes 1 + AddInputFromArray(TensorShape({2}), {3, 16}); + // partition_shapes 2 + AddInputFromArray(TensorShape({2}), {7, 16}); + // sp_values + AddInputFromArray(TensorShape({12}), + {1, 5, 3, 6, 12, 14, 15, 0, 5, 5, 11, 7}); + // sp_indices + AddInputFromArray(TensorShape({12, 2}), + {2, 3, 4, 6, 1, 6, 12, 12, 12, 12, 11, 5, + 15, 0, 11, 6, 7, 9, 11, 8, 12, 13, 13, 0}); + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {16, 16}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({6})); + test::FillValues(&expected_values, {0, 1, 3, 5, 5, 5}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({6, 2})); + test::FillValues(&expected_indices, + {11, 6, 2, 3, 1, 6, 4, 6, 7, 9, 11, 8}); + test::ExpectTensorEqual(expected_indices, *GetOutput(3)); + } + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({2})); + test::FillValues(&expected_values, {0, 1}); + test::ExpectTensorEqual(expected_values, *GetOutput(1)); + Tensor expected_indices(allocator(), DT_INT64, TensorShape({2, 2})); + test::FillValues(&expected_indices, {12, 12, 13, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(4)); + } + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({4})); + test::FillValues(&expected_values, {2, 3, 5, 6}); + test::ExpectTensorEqual(expected_values, *GetOutput(2)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({4, 2})); + test::FillValues(&expected_indices, {12, 13, 12, 12, 11, 5, 15, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(5)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, Partition2_Fill_Empty) { + MakeOpAndSetDevice(Device::GPU, 2, 
true, false, -1); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {5, 8}); + // partition_shapes 1 + AddInputFromArray(TensorShape({2}), {5, 8}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({9})); + test::FillValues(&expected_values, {-6, -4, -3, -2, 0, 0, 2, 3, 4}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({9, 2})); + test::FillValues(&expected_indices, {6, 1, 5, 2, 4, 0, 3, 0, 0, 0, 2, + 0, 6, 7, 1, 2, 0, 4}); + test::ExpectTensorEqual(expected_indices, *GetOutput(2)); + } + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({2})); + test::FillValues(&expected_values, {0, 4}); + test::ExpectTensorEqual(expected_values, *GetOutput(1)); + Tensor expected_indices(allocator(), DT_INT64, TensorShape({2, 2})); + test::FillValues(&expected_indices, {3, 4, 6, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(3)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, + Partition2_Fill_Empty_Prune_Invalid) { + MakeOpAndSetDevice(Device::GPU, 2, true, true, -1); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {5, 8}); + // partition_shapes 1 + AddInputFromArray(TensorShape({2}), {5, 8}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({7})); + test::FillValues(&expected_values, {0, 0, 0, 0, 2, 3, 4}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({7, 2})); + test::FillValues(&expected_indices, + {0, 0, 2, 0, 4, 0, 5, 0, 6, 7, 1, 2, 0, 4}); + test::ExpectTensorEqual(expected_indices, *GetOutput(2)); + } + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({2})); + test::FillValues(&expected_values, {0, 4}); + test::ExpectTensorEqual(expected_values, *GetOutput(1)); + Tensor expected_indices(allocator(), DT_INT64, TensorShape({2, 2})); + test::FillValues(&expected_indices, {3, 4, 6, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(3)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, + Partition2_Fill_Empty_Prune_Invalid_Default_7) { + MakeOpAndSetDevice(Device::GPU, 2, true, true, 7); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {5, 8}); + // partition_shapes 1 + AddInputFromArray(TensorShape({2}), {5, 8}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({4})); + test::FillValues(&expected_values, {0, 2, 3, 4}); + 
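+  // Explanatory note on the expected values (derived from the other tests in
+  // this file): with two partitions of shape {5, 8}, ids in [0, 5) stay in
+  // partition 0 and ids in [5, 10) are re-based to partition 1 as (id - 5).
+  // prune_invalid_id drops the negative ids, and fill_empty_row inserts
+  // default_id = 7 (local id 2 of partition 1) at column 0 of every row left
+  // empty, which is why partition 0 keeps only {0, 2, 3, 4} here and the
+  // partition-1 block below expects {0, 2, 2, 2, 4}.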
test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({4, 2})); + test::FillValues(&expected_indices, {0, 0, 6, 7, 1, 2, 0, 4}); + test::ExpectTensorEqual(expected_indices, *GetOutput(2)); + } + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({5})); + test::FillValues(&expected_values, {0, 2, 2, 2, 4}); + test::ExpectTensorEqual(expected_values, *GetOutput(1)); + Tensor expected_indices(allocator(), DT_INT64, TensorShape({5, 2})); + test::FillValues(&expected_indices, {3, 4, 2, 0, 4, 0, 5, 0, 6, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(3)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, + Partition2_Prune_Invalid_Default_3) { + MakeOpAndSetDevice(Device::GPU, 2, false, true, 3); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {5, 8}); + // partition_shapes 1 + AddInputFromArray(TensorShape({2}), {5, 8}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({4})); + test::FillValues(&expected_values, {0, 2, 3, 4}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({4, 2})); + test::FillValues(&expected_indices, {0, 0, 6, 7, 1, 2, 0, 4}); + test::ExpectTensorEqual(expected_indices, *GetOutput(2)); + } + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({2})); + test::FillValues(&expected_values, {0, 4}); + test::ExpectTensorEqual(expected_values, *GetOutput(1)); + Tensor expected_indices(allocator(), DT_INT64, TensorShape({2, 2})); + test::FillValues(&expected_indices, {3, 4, 6, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(3)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, Partition1) { + MakeOpAndSetDevice(Device::GPU, 1, false, false, -1); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {10, 8}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + // sp_dense_shape + AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({10})); + test::FillValues(&expected_values, + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({10, 2})); + test::FillValues(&expected_indices, {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, + 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + test::ExpectTensorEqual(expected_indices, *GetOutput(1)); + } +} + +TEST_F(FusedEmbeddingSparsePreLookUpOpTest, + Partition1_Fill_Empty_Prune_Invalid_Default_3) { + MakeOpAndSetDevice(Device::GPU, 1, true, true, 3); + // partition_shapes 0 + AddInputFromArray(TensorShape({2}), {10, 8}); + + // sp_values + AddInputFromArray(TensorShape({10}), + {0, 4, 3, -2, 5, -3, -4, 9, -6, 2}); + + // sp_indices + AddInputFromArray( + TensorShape({10, 2}), + {0, 0, 0, 4, 1, 2, 3, 0, 3, 4, 4, 0, 5, 2, 6, 0, 6, 1, 6, 7}); + + // sp_dense_shape + 
AddInputFromArray(TensorShape({2}), {7, 8}); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_values(allocator(), DT_INT64, TensorShape({9})); + test::FillValues(&expected_values, {0, 4, 3, 5, 9, 2, 3, 3, 3}); + test::ExpectTensorEqual(expected_values, *GetOutput(0)); + + Tensor expected_indices(allocator(), DT_INT64, TensorShape({9, 2})); + test::FillValues(&expected_indices, {0, 0, 0, 4, 1, 2, 3, 4, 6, 0, 6, + 7, 2, 0, 4, 0, 5, 0}); + test::ExpectTensorEqual(expected_indices, *GetOutput(1)); + } +} + +} // namespace +} // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/fused_layer_norm/BUILD b/deepray/custom_ops/embedding_variable/cc/fused_layer_norm/BUILD new file mode 100644 index 00000000..379f2da7 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_layer_norm/BUILD @@ -0,0 +1,22 @@ +load("//tensorflow:tensorflow.bzl", "tf_custom_op_library") + +tf_custom_op_library( + name = "fused_embedding_ops", + srcs = [ + "compile_util.h", + "fused_layer_normalize_ops.cc", + ], + gpu_deps = [ + ], + gpu_srcs = [ + ], + deps = [ + "@com_github_google_leveldb//:leveldb", + "@sparsehash_c11//:dense_hash_map", + # "@org_tensorflow//tensorflow/core:framework_headers_lib", + # "@org_tensorflow//tensorflow/core/common_runtime:core_cpu", + "@org_tensorflow//tensorflow/core/kernels:training_op_helpers", + "@org_tensorflow//tensorflow/core/kernels:gpu_device_array", + "@org_tensorflow//tensorflow/core/kernels:gather_functor", + ], +) diff --git a/deepray/custom_ops/embedding_variable/cc/fused_layer_norm/compile_util.h b/deepray/custom_ops/embedding_variable/cc/fused_layer_norm/compile_util.h new file mode 100644 index 00000000..cfeffbd8 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_layer_norm/compile_util.h @@ -0,0 +1,78 @@ +#ifndef TENSORFLOW_CORE_KERNELS_FUSED_LAYER_NORMALIZE_COMPILE_UTIL_OP_H_ +#define TENSORFLOW_CORE_KERNELS_FUSED_LAYER_NORMALIZE_COMPILE_UTIL_OP_H_ + +#include + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/lib/core/threadpool.h" + +using namespace tensorflow; +// A class for forced loop unrolling at compile time +template +struct compile_time_for { + template + inline static void op(const Lambda& function, Args... args) { + compile_time_for::op(function, args...); + function(std::integral_constant{}, args...); + } +}; +template <> +struct compile_time_for<1> { + template + inline static void op(const Lambda& function, Args... args) { + function(std::integral_constant{}, args...); + } +}; +template <> +struct compile_time_for<0> { + // 0 loops, do nothing + template + inline static void op(const Lambda& function, Args... 
args) {} +}; +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + +template +inline __m512 reduce_sum_block(const __m512* v) { + __m512 block_sum = _mm512_setzero_ps(); + auto reduce_sum = [&](auto idx) { + block_sum = _mm512_add_ps(block_sum, v[idx]); + }; + compile_time_for::op(reduce_sum); + return block_sum; +} + +inline __m512 reduce_sum_block_ps(const __m512* v, int64 BLOCK_NUM) { + switch (BLOCK_NUM) { + case 1: + return v[0]; + case 2: + return reduce_sum_block<2>(v); + case 3: + return reduce_sum_block<3>(v); + case 4: + return reduce_sum_block<4>(v); + case 5: + return reduce_sum_block<5>(v); + case 6: + return reduce_sum_block<6>(v); + case 7: + return reduce_sum_block<7>(v); + case 8: + return reduce_sum_block<8>(v); + } +} + +static inline float horizontal_add(__m512 src) { + __m512 tmp = _mm512_add_ps( + src, _mm512_shuffle_f32x4(src, src, _MM_SHUFFLE(1, 0, 3, 2))); + __m128 r = _mm512_castps512_ps128(_mm512_add_ps( + tmp, _mm512_shuffle_f32x4(tmp, tmp, _MM_SHUFFLE(2, 3, 0, 1)))); + r = _mm_hadd_ps(r, r); + return _mm_cvtss_f32(_mm_hadd_ps(r, r)); +} + +#endif // #if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) +#endif // TENSORFLOW_CORE_KERNELS_FUSED_LAYER_NORMALIZE_COMPILE_UTIL_OP_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/fused_layer_norm/fused_layer_normalize_ops.cc b/deepray/custom_ops/embedding_variable/cc/fused_layer_norm/fused_layer_normalize_ops.cc new file mode 100644 index 00000000..d91dfe6c --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_layer_norm/fused_layer_normalize_ops.cc @@ -0,0 +1,678 @@ +#include "compile_util.h" + +using namespace tensorflow; + +template +class FusedLayerNormOp : public OpKernel { + private: + float epsilon; + + public: + explicit FusedLayerNormOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon)); + } + + ~FusedLayerNormOp() {} + + void Compute(OpKernelContext* context) override { + // Grab the input + const Tensor* input_tensor = &context->input(0); + const Tensor* gamma_tensor = &context->input(1); + const Tensor* beta_tensor = &context->input(2); + + const T* input = input_tensor->flat().data(); + const float* gamma = gamma_tensor->flat().data(); + const float* beta = beta_tensor->flat().data(); + + // To check the input + OP_REQUIRES(context, (input_tensor->dims() >= 2), + errors::InvalidArgument("Input dimension should be >= 2")); + OP_REQUIRES(context, (gamma_tensor->dims() == 1), + errors::InvalidArgument("dims(gamma) != 1")); + OP_REQUIRES(context, (beta_tensor->dims() == 1), + errors::InvalidArgument("dims(beta) != 1")); + + int64 cols = input_tensor->dim_size(input_tensor->dims() - 1); + OP_REQUIRES( + context, (gamma_tensor->dim_size(0) == cols), + errors::InvalidArgument("size(gamma) != last_dim_size_of_input")); + OP_REQUIRES( + context, (beta_tensor->dim_size(0) == cols), + errors::InvalidArgument("size(beta) != last_dim_size_of_input")); + + int64 rows = 1; + TensorShape mean_var_shape; + for (int i = 0; i < input_tensor->dims() - 1; ++i) { + auto dim_size = input_tensor->dim_size(i); + rows *= dim_size; + mean_var_shape.AddDim(dim_size); + } + + // Create output tensors + Tensor* output_tensor = NULL; + Tensor* mean_tensor = NULL; + Tensor* rvariance_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor->shape(), + &output_tensor)); + OP_REQUIRES_OK(context, + context->allocate_output(1, mean_var_shape, &mean_tensor)); + OP_REQUIRES_OK(context, context->allocate_output(2, 
mean_var_shape, + &rvariance_tensor)); + T* output = output_tensor->flat().data(); + float* mean = mean_tensor->flat().data(); + float* rvariance = rvariance_tensor->flat().data(); + + // Init + memset(mean, 0, sizeof(float) * rows); + memset(rvariance, 0, sizeof(float) * rows); + + // Do it + // Let every thread compute 16 rows to avoid false sharing + const int64 total_unit = (rows + 15) / 16; + const int64 unit_cost = + 16 * cols * 50; // assume every element consumes 50 cycles + +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + int64 block_num = cols >> 7; + int64 remainder_128 = cols & 0x7F; + int64 remainder_16 = remainder_128 & 0x0F; + int64 remainder_block_num = remainder_128 >> 4; + int64 remainder_block_num_total = remainder_block_num + !!remainder_16; +#endif // AVX512F + const float one_over_cols = 1.0f / cols; + + auto& worker_threads = + *(context->device()->tensorflow_cpu_worker_threads()); + thread::ThreadPool* thread_pool = worker_threads.workers; + + thread_pool->ParallelFor( + total_unit, unit_cost, [&](int64 begin_unit, int64 end_unit) { + auto begin_row = begin_unit * 16; + auto end_row = end_unit * 16; + if (end_row > rows) { + end_row = rows; + } +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + forward_avx512(input, gamma, beta, output, mean, rvariance, cols, + begin_row, end_row, block_num, remainder_block_num, + remainder_block_num_total, remainder_128, remainder_16, + one_over_cols); +#else + forward(input, gamma, beta, output, mean, rvariance, cols, begin_row, end_row, one_over_cols); +#endif // AVX512F + }); + } + + private: + // Compute the rows locate in the range of [begin_row, begin_row + ROWS) + void forward(const float* input, const float* gamma, const float* beta, + float* output, float* mean, float* rvariance, int64 cols, + int64 begin_row, int64 end_row, const float one_over_cols) { + for (int64 i = begin_row; i < end_row; i++) { + // Sum + int64 j = 0; + for (; j + 7 < cols; j += 8) { + T data_0 = input[i * cols + j]; + T data_1 = input[i * cols + j + 1]; + T data_2 = input[i * cols + j + 2]; + T data_3 = input[i * cols + j + 3]; + T data_4 = input[i * cols + j + 4]; + T data_5 = input[i * cols + j + 5]; + T data_6 = input[i * cols + j + 6]; + T data_7 = input[i * cols + j + 7]; + mean[i] += data_0 + data_1 + data_2 + data_3 + data_4 + data_5 + + data_6 + data_7; + } + for (; j < cols; j++) { + mean[i] += input[i * cols + j]; + } + // Mean + mean[i] *= one_over_cols; + + // variance + for (j = 0; j + 7 < cols; j += 8) { + T data_0 = input[i * cols + j] - mean[i]; + T data_1 = input[i * cols + j + 1] - mean[i]; + T data_2 = input[i * cols + j + 2] - mean[i]; + T data_3 = input[i * cols + j + 3] - mean[i]; + T data_4 = input[i * cols + j + 4] - mean[i]; + T data_5 = input[i * cols + j + 5] - mean[i]; + T data_6 = input[i * cols + j + 6] - mean[i]; + T data_7 = input[i * cols + j + 7] - mean[i]; + rvariance[i] += data_0 * data_0 + data_1 * data_1 + data_2 * data_2 + + data_3 * data_3 + data_4 * data_4 + data_5 * data_5 + + data_6 * data_6 + data_7 * data_7; + } + for (; j < cols; j++) { + T data = input[i * cols + j] - mean[i]; + rvariance[i] += data * data; + } + rvariance[i] *= one_over_cols; + rvariance[i] += epsilon; + rvariance[i] = 1.0f / sqrtf(rvariance[i]); + + for (j = 0; j + 7 < cols; j += 8) { + T data_0 = (input[i * cols + j] - mean[i]) * rvariance[i]; + T data_1 = (input[i * cols + j + 1] - mean[i]) * rvariance[i]; + T data_2 = (input[i * cols + j + 2] - mean[i]) * rvariance[i]; + T data_3 = (input[i * cols + j + 3] - 
mean[i]) * rvariance[i]; + T data_4 = (input[i * cols + j + 4] - mean[i]) * rvariance[i]; + T data_5 = (input[i * cols + j + 5] - mean[i]) * rvariance[i]; + T data_6 = (input[i * cols + j + 6] - mean[i]) * rvariance[i]; + T data_7 = (input[i * cols + j + 7] - mean[i]) * rvariance[i]; + output[i * cols + j] = gamma[j] * data_0 + beta[j]; + output[i * cols + j + 1] = gamma[j] * data_1 + beta[j]; + output[i * cols + j + 2] = gamma[j] * data_2 + beta[j]; + output[i * cols + j + 3] = gamma[j] * data_3 + beta[j]; + output[i * cols + j + 4] = gamma[j] * data_4 + beta[j]; + output[i * cols + j + 5] = gamma[j] * data_5 + beta[j]; + output[i * cols + j + 6] = gamma[j] * data_6 + beta[j]; + output[i * cols + j + 7] = gamma[j] * data_7 + beta[j]; + } + for (; j < cols; j++) { + T data = (input[i * cols + j] - mean[i]) * rvariance[i]; + output[i * cols + j] = gamma[j] * data + beta[j]; + } + } + } + +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + // AVX512 block size = 8; pack 8 * 16 = 128; + inline void forward_avx512(const float* input, const float* gamma, + const float* beta, float* output, float* mean, + float* rvariance, int64 cols, int64 begin_row, + int64 end_row, int64 block_num, + int64 remainder_block_num, + int64 remainder_block_num_total, + int64 remainder_128, int64 remainder_16, + const float one_over_cols) { + for (int64 i = begin_row; i < end_row; ++i) { + // Sum + for (int64 j = 0; j < block_num; ++j) { + __m512 inputs[8]; + auto load = [&](auto idx) { + inputs[idx] = _mm512_loadu_ps(input + cols * i + 128 * j + 16 * idx); + }; + compile_time_for<8>::op(load); + __m512 block_sum = reduce_sum_block<8>(inputs); + mean[i] += _mm512_reduce_add_ps(block_sum); + } + if (remainder_block_num_total) { // remainder sum + __m512 inputs[remainder_block_num_total]; + for (int64 idx = 0; idx < remainder_block_num; idx++) { + inputs[idx] = _mm512_loadu_ps(input + cols * i + cols - + remainder_128 + 16 * idx); + } + if (remainder_16) { + __mmask16 mask = 0xFFFF >> (16 - remainder_16); + inputs[remainder_block_num] = _mm512_maskz_loadu_ps( + mask, input + cols * i + cols - remainder_16); + } + __m512 block_sum = + reduce_sum_block_ps(inputs, remainder_block_num_total); + mean[i] += _mm512_reduce_add_ps(block_sum); + } + + // Mean + mean[i] *= one_over_cols; + __m512 means = _mm512_set1_ps(mean[i]); + + // Variance + for (int64 j = 0; j < block_num; ++j) { + __m512 inputs[8]; + auto load_var = [&](auto idx) { + inputs[idx] = _mm512_loadu_ps(input + cols * i + 128 * j + 16 * idx); + inputs[idx] = _mm512_sub_ps(inputs[idx], means); + inputs[idx] = _mm512_mul_ps(inputs[idx], inputs[idx]); + }; + compile_time_for<8>::op(load_var); + __m512 block_sum = reduce_sum_block<8>(inputs); + rvariance[i] += _mm512_reduce_add_ps(block_sum); + } + if (remainder_block_num_total) { // remainder var + __m512 inputs[remainder_block_num_total]; + for (int64 idx = 0; idx < remainder_block_num; idx++) { + inputs[idx] = _mm512_loadu_ps(input + cols * i + cols - + remainder_128 + 16 * idx); + inputs[idx] = _mm512_sub_ps(inputs[idx], means); + inputs[idx] = _mm512_mul_ps(inputs[idx], inputs[idx]); + } + if (remainder_16) { + __mmask16 mask = 0xFFFF >> (16 - remainder_16); + inputs[remainder_block_num] = _mm512_maskz_loadu_ps( + mask, input + cols * i + cols - remainder_16); + inputs[remainder_block_num] = + _mm512_maskz_sub_ps(mask, inputs[remainder_block_num], means); + inputs[remainder_block_num] = _mm512_maskz_mul_ps( + mask, inputs[remainder_block_num], inputs[remainder_block_num]); + } + __m512 block_sum = + 
reduce_sum_block_ps(inputs, remainder_block_num_total); + rvariance[i] += _mm512_reduce_add_ps(block_sum); + } + + rvariance[i] *= one_over_cols; + rvariance[i] += epsilon; + rvariance[i] = 1.0f / sqrtf(rvariance[i]); + __m512 rvariances = _mm512_set1_ps(rvariance[i]); + // Normalize and store + for (int64 j = 0; j < block_num; ++j) { + __m512 inputs[8]; + __m512 nums[8]; // used to load gammas and betas + auto load_normalize = [&](auto idx) { + // (x - mean) / sqrt(var + eps) + inputs[idx] = _mm512_loadu_ps(input + cols * i + 128 * j + 16 * idx); + inputs[idx] = _mm512_sub_ps(inputs[idx], means); + inputs[idx] = _mm512_mul_ps(inputs[idx], rvariances); + // Mul gamma + nums[idx] = _mm512_loadu_ps(gamma + 128 * j + 16 * idx); + inputs[idx] = _mm512_mul_ps(inputs[idx], nums[idx]); + // Add beta + nums[idx] = _mm512_loadu_ps(beta + 128 * j + 16 * idx); + inputs[idx] = _mm512_add_ps(inputs[idx], nums[idx]); + + // Store + _mm512_storeu_ps(output + cols * i + 128 * j + 16 * idx, inputs[idx]); + }; + compile_time_for<8>::op(load_normalize); + } + if (remainder_block_num_total) { // remainder normalize and store + __m512 inputs; + __m512 nums; // used to load gammas and betas + for (int64 idx = 0; idx < remainder_block_num; + idx++) { // remainder of 128 + // (x - mean) / sqrt(var + eps) + inputs = _mm512_loadu_ps(input + cols * i + cols - remainder_128 + + 16 * idx); + inputs = _mm512_sub_ps(inputs, means); + inputs = _mm512_mul_ps(inputs, rvariances); + // Mul gamma + nums = _mm512_loadu_ps(gamma + cols - remainder_128 + 16 * idx); + inputs = _mm512_mul_ps(inputs, nums); + // Add beta + nums = _mm512_loadu_ps(beta + cols - remainder_128 + 16 * idx); + inputs = _mm512_add_ps(inputs, nums); + + // Store + _mm512_storeu_ps(output + cols * i + cols - remainder_128 + 16 * idx, + inputs); + } + if (remainder_16) { // remainder of 16 + __mmask16 mask = 0xFFFF >> (16 - remainder_16); + // (x - mean) / sqrt(var + eps) + inputs = _mm512_maskz_loadu_ps( + mask, input + cols * i + cols - remainder_16); + inputs = _mm512_maskz_sub_ps(mask, inputs, means); + inputs = _mm512_maskz_mul_ps(mask, inputs, rvariances); + // Mul gamma + nums = _mm512_maskz_loadu_ps(mask, gamma + cols - remainder_16); + inputs = _mm512_maskz_mul_ps(mask, inputs, nums); + // Add beta + nums = _mm512_maskz_loadu_ps(mask, beta + cols - remainder_16); + inputs = _mm512_maskz_add_ps(mask, inputs, nums); + + // Store + _mm512_mask_storeu_ps(output + cols * i + cols - remainder_16, mask, + inputs); + } + } + } + } + +#endif // forward layer norm avx512 impl +}; + +REGISTER_KERNEL_BUILDER( + Name("FusedLayerNorm").Device(DEVICE_CPU).TypeConstraint("T"), + FusedLayerNormOp); + +template +class FusedLayerNormGradOp : public OpKernel { + public: + explicit FusedLayerNormGradOp(OpKernelConstruction* context) + : OpKernel(context) {} + + ~FusedLayerNormGradOp() {} + + void Compute(OpKernelContext* context) override { + // Grab the input + const Tensor* y_grad_tensor = &context->input(0); + const Tensor* x_tensor = &context->input(1); + const Tensor* mean_tensor = &context->input(2); + const Tensor* rvariance_tensor = &context->input(3); + const Tensor* gamma_tensor = &context->input(4); + + const T* y_grad = y_grad_tensor->flat().data(); + const T* x = x_tensor->flat().data(); + const float* mean = mean_tensor->flat().data(); + const float* rvariance = rvariance_tensor->flat().data(); + const float* gamma = gamma_tensor->flat().data(); + + int64 cols = x_tensor->dim_size(x_tensor->dims() - 1); + int64 rows = mean_tensor->NumElements(); + + // 
Create output tensors + Tensor* x_grad_tensor = NULL; + Tensor* gamma_grad_tensor = NULL; + Tensor* beta_grad_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output(0, x_tensor->shape(), + &x_grad_tensor)); + OP_REQUIRES_OK(context, context->allocate_output(1, gamma_tensor->shape(), + &gamma_grad_tensor)); + OP_REQUIRES_OK(context, context->allocate_output(2, gamma_tensor->shape(), + &beta_grad_tensor)); + T* x_grad = x_grad_tensor->flat().data(); + float* gamma_grad = gamma_grad_tensor->flat().data(); + float* beta_grad = beta_grad_tensor->flat().data(); + + // Init + memset(gamma_grad, 0, sizeof(float) * cols); + memset(beta_grad, 0, sizeof(float) * cols); + + auto& worker_threads = + *(context->device()->tensorflow_cpu_worker_threads()); + thread::ThreadPool* thread_pool = worker_threads.workers; + +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + const int total_unit = (rows >= 128 ? 8 : (rows + 15) / 16); + const int64 rows_per_unit = (rows + total_unit - 1) / total_unit; + const int64 unit_cost = rows_per_unit * cols * 100; + thread_pool->ParallelFor( + total_unit, unit_cost, [&](int64 begin_unit, int64 end_unit) { + auto begin_row = begin_unit * rows_per_unit; + auto end_row = end_unit * rows_per_unit; + if (end_row > rows) { + end_row = rows; + } + backward(y_grad, x, mean, rvariance, gamma, x_grad, gamma_grad, + beta_grad, cols, begin_row, end_row); + }); +#else + const float one_over_cols = 1.0f / cols; + const int64 total_unit = (rows + 15) / 16; + const int64 unit_cost = + 16 * cols * 100; // assume every element consumes 100 cycles + + thread_pool->ParallelFor( + total_unit, unit_cost, [&](int64 begin_unit, int64 end_unit) { + auto begin_row = begin_unit * 16; + auto end_row = end_unit * 16; + if (end_row > rows) { + end_row = rows; + } + backward(y_grad, x, mean, rvariance, gamma, x_grad, gamma_grad, + beta_grad, begin_row, end_row, cols, one_over_cols); + }); +#endif // backward compute + } + + private: +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + void backward(const float* diff, const float* x, const float* mean, + const float* rvariance, const float* gamma, float* x_diff, + float* gamma_diff, float* beta_diff, int64 cols, int begin_row, + int end_row) { + int i = begin_row; + for (; i + 3 < end_row; i += 4) { + backward_avx512<4>(diff, x, mean, rvariance, gamma, x_diff, gamma_diff, + beta_diff, cols, i); + } + for (; i < end_row; ++i) { + backward_avx512<1>(diff, x, mean, rvariance, gamma, x_diff, gamma_diff, + beta_diff, cols, i); + } + } +#else + // For gradient of x, it comes from 3 parts: x-mean, mean, and rvariance + // grad from (x - mean): y_grad * gamma * [rvariance] + // grad from mean: - sum_row(y_grad * gamma * [rvariance]) / #cols + // grad from rvariance: sum_row(y_grad * gamma * (x - mean)) * (- + // [rvariance]^3) * (x - mean) / #cols + // For gradient of gamma, grad = y_grad * (x - mean) * rvariance + // For gradient of beta, grad = y_grad + void backward(const float* y_grad, const float* x, const float* mean, + const float* rvariance, const float* gamma, float* x_grad, + float* gamma_grad, float* beta_grad, int64 begin_row, + int64 end_row, int64 cols, const float one_over_cols) { + for (int64 i = begin_row; i < end_row; ++i) { + int64 j = 0; + float sum_m = 0; + float sum_r = 0; + // sum_m: sum_row(y_grad * gamma) + // sum_r: sum_row(y_grad * gamma * (x - mean)) + for (; j + 7 < cols; j += 8) { + T data_0 = y_grad[i * cols + j] * gamma[j]; + T data_1 = y_grad[i * cols + j + 1] * gamma[j + 1]; + T data_2 = y_grad[i * 
cols + j + 2] * gamma[j + 2]; + T data_3 = y_grad[i * cols + j + 3] * gamma[j + 3]; + T data_4 = y_grad[i * cols + j + 4] * gamma[j + 4]; + T data_5 = y_grad[i * cols + j + 5] * gamma[j + 5]; + T data_6 = y_grad[i * cols + j + 6] * gamma[j + 6]; + T data_7 = y_grad[i * cols + j + 7] * gamma[j + 7]; + sum_m += data_0 + data_1 + data_2 + data_3 + data_4 + data_5 + data_6 + + data_7; + + data_0 = data_0 * (x[i * cols + j] - mean[i]); + data_1 = data_1 * (x[i * cols + j + 1] - mean[i]); + data_2 = data_2 * (x[i * cols + j + 2] - mean[i]); + data_3 = data_3 * (x[i * cols + j + 3] - mean[i]); + data_4 = data_4 * (x[i * cols + j + 4] - mean[i]); + data_5 = data_5 * (x[i * cols + j + 5] - mean[i]); + data_6 = data_6 * (x[i * cols + j + 6] - mean[i]); + data_7 = data_7 * (x[i * cols + j + 7] - mean[i]); + sum_r += data_0 + data_1 + data_2 + data_3 + data_4 + data_5 + data_6 + + data_7; + } + for (; j < cols; ++j) { // remainder + sum_m += y_grad[i * cols + j] * gamma[j]; + sum_r += y_grad[i * cols + j] * gamma[j] * (x[i * cols + j] - mean[i]); + } + sum_m *= one_over_cols; + sum_r *= rvariance[i] * rvariance[i]; + sum_r *= one_over_cols; + + for (j = 0; j + 7 < cols; j += 8) { + x_grad[i * cols + j] = y_grad[i * cols + j] * gamma[j]; + x_grad[i * cols + j + 1] = y_grad[i * cols + j + 1] * gamma[j + 1]; + x_grad[i * cols + j + 2] = y_grad[i * cols + j + 2] * gamma[j + 2]; + x_grad[i * cols + j + 3] = y_grad[i * cols + j + 3] * gamma[j + 3]; + x_grad[i * cols + j + 4] = y_grad[i * cols + j + 4] * gamma[j + 4]; + x_grad[i * cols + j + 5] = y_grad[i * cols + j + 5] * gamma[j + 5]; + x_grad[i * cols + j + 6] = y_grad[i * cols + j + 6] * gamma[j + 6]; + x_grad[i * cols + j + 7] = y_grad[i * cols + j + 7] * gamma[j + 7]; + + x_grad[i * cols + j] -= sum_m + sum_r * (x[i * cols + j] - mean[i]); + x_grad[i * cols + j + 1] -= + sum_m + sum_r * (x[i * cols + j + 1] - mean[i]); + x_grad[i * cols + j + 2] -= + sum_m + sum_r * (x[i * cols + j + 2] - mean[i]); + x_grad[i * cols + j + 3] -= + sum_m + sum_r * (x[i * cols + j + 3] - mean[i]); + x_grad[i * cols + j + 4] -= + sum_m + sum_r * (x[i * cols + j + 4] - mean[i]); + x_grad[i * cols + j + 5] -= + sum_m + sum_r * (x[i * cols + j + 5] - mean[i]); + x_grad[i * cols + j + 6] -= + sum_m + sum_r * (x[i * cols + j + 6] - mean[i]); + x_grad[i * cols + j + 7] -= + sum_m + sum_r * (x[i * cols + j + 7] - mean[i]); + + x_grad[i * cols + j] *= rvariance[i]; + x_grad[i * cols + j + 1] *= rvariance[i]; + x_grad[i * cols + j + 2] *= rvariance[i]; + x_grad[i * cols + j + 3] *= rvariance[i]; + x_grad[i * cols + j + 4] *= rvariance[i]; + x_grad[i * cols + j + 5] *= rvariance[i]; + x_grad[i * cols + j + 6] *= rvariance[i]; + x_grad[i * cols + j + 7] *= rvariance[i]; + } + for (; j < cols; ++j) { // remainder + x_grad[i * cols + j] = y_grad[i * cols + j] * gamma[j]; + x_grad[i * cols + j] -= sum_m + sum_r * (x[i * cols + j] - mean[i]); + x_grad[i * cols + j] *= rvariance[i]; + } + + // grad of gamma + for (j = 0; j + 7 < cols; j += 8) { + gamma_grad[j] += + y_grad[i * cols + j] * (x[i * cols + j] - mean[i]) * rvariance[i]; + gamma_grad[j + 1] += y_grad[i * cols + j + 1] * + (x[i * cols + j + 1] - mean[i]) * rvariance[i]; + gamma_grad[j + 2] += y_grad[i * cols + j + 2] * + (x[i * cols + j + 2] - mean[i]) * rvariance[i]; + gamma_grad[j + 3] += y_grad[i * cols + j + 3] * + (x[i * cols + j + 3] - mean[i]) * rvariance[i]; + gamma_grad[j + 4] += y_grad[i * cols + j + 4] * + (x[i * cols + j + 4] - mean[i]) * rvariance[i]; + gamma_grad[j + 5] += y_grad[i * cols + j + 5] * + (x[i * cols + 
j + 5] - mean[i]) * rvariance[i]; + gamma_grad[j + 6] += y_grad[i * cols + j + 6] * + (x[i * cols + j + 6] - mean[i]) * rvariance[i]; + gamma_grad[j + 7] += y_grad[i * cols + j + 7] * + (x[i * cols + j + 7] - mean[i]) * rvariance[i]; + } + for (; j < cols; ++j) { // remainder + gamma_grad[j] += + y_grad[i * cols + j] * (x[i * cols + j] - mean[i]) * rvariance[i]; + } + + // grad of beta + for (j = 0; j + 7 < cols; j += 8) { + beta_grad[j] += y_grad[i * cols + j]; + beta_grad[j + 1] += y_grad[i * cols + j + 1]; + beta_grad[j + 2] += y_grad[i * cols + j + 2]; + beta_grad[j + 3] += y_grad[i * cols + j + 3]; + beta_grad[j + 4] += y_grad[i * cols + j + 4]; + beta_grad[j + 5] += y_grad[i * cols + j + 5]; + beta_grad[j + 6] += y_grad[i * cols + j + 6]; + beta_grad[j + 7] += y_grad[i * cols + j + 7]; + } + for (; j < cols; ++j) { // remainder + beta_grad[j] += y_grad[i * cols + j]; + } + } + } +#endif // backward define + +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + template + inline void backward_avx512(const float* y_grad, const float* x, + const float* mean, const float* rvariance, + const float* gamma, float* x_grad, + float* gamma_grad, float* beta_grad, int64 cols, + int64 start_row) { + float sum_m[ROWS], sum_r[ROWS]; + __m512 vsum_m[ROWS], vsum_r[ROWS], vmean[ROWS], vrvariance[ROWS]; + + // Init + auto setzero = [&](auto idx) { + vsum_m[idx] = _mm512_setzero_ps(); + vsum_r[idx] = _mm512_setzero_ps(); + vmean[idx] = _mm512_set1_ps(mean[start_row + idx]); + vrvariance[idx] = _mm512_set1_ps(rvariance[start_row + idx]); + }; + compile_time_for::op(setzero); + + // Compute sum for y_grad * gamma and y_grad * gamma * (x - mean) + int64 j = 0; + for (; j + 15 < cols; j += 16) { + auto compute_sum = [&](auto idx) { + __m512 vy_grad = _mm512_loadu_ps(y_grad + (start_row + idx) * cols + j); + __m512 vgamma = _mm512_loadu_ps(gamma + j); + + __m512 mul = _mm512_mul_ps(vy_grad, vgamma); + vsum_m[idx] = _mm512_add_ps(mul, vsum_m[idx]); + + __m512 vx = _mm512_loadu_ps(x + (start_row + idx) * cols + j); + __m512 x_minus_mean = _mm512_sub_ps(vx, vmean[idx]); + vsum_r[idx] = _mm512_fmadd_ps(mul, x_minus_mean, vsum_r[idx]); + }; + + compile_time_for::op(compute_sum); + } + + auto reduce_sum = [&](auto idx) { + sum_m[idx] = horizontal_add(vsum_m[idx]); + sum_r[idx] = horizontal_add(vsum_r[idx]); + + for (int64 c = j; c < cols; ++c) { + const auto offset = (start_row + idx) * cols + c; + sum_m[idx] += y_grad[offset] * gamma[c]; + sum_r[idx] += + y_grad[offset] * gamma[c] * (x[offset] - mean[start_row + idx]); + } + + sum_m[idx] /= cols; + sum_r[idx] *= rvariance[start_row + idx] * rvariance[start_row + idx]; + sum_r[idx] /= cols; + + vsum_m[idx] = _mm512_set1_ps(sum_m[idx]); + vsum_r[idx] = _mm512_set1_ps(sum_r[idx]); + }; + + compile_time_for::op(reduce_sum); + + // Compute gradient for x, gamma, beta + for (j = 0; j + 15 < cols; j += 16) { + __m512 vgamma_grad = _mm512_loadu_ps(gamma_grad + j); + __m512 vbeta_grad = _mm512_loadu_ps(beta_grad + j); + + auto compute_grad = [&](auto idx) { + __m512 vy_grad = _mm512_loadu_ps(y_grad + (start_row + idx) * cols + j); + __m512 vgamma = _mm512_loadu_ps(gamma + j); + + __m512 vx_grad = _mm512_mul_ps(vy_grad, vgamma); + + __m512 vx = _mm512_loadu_ps(x + (start_row + idx) * cols + j); + __m512 x_minus_mean = _mm512_sub_ps(vx, vmean[idx]); + + vx_grad = _mm512_sub_ps( + vx_grad, _mm512_fmadd_ps(vsum_r[idx], x_minus_mean, vsum_m[idx])); + vx_grad = _mm512_mul_ps(vx_grad, vrvariance[idx]); + + // save gradient of x + _mm512_storeu_ps(x_grad + (start_row + 
idx) * cols + j, vx_grad); + + // gradient for gamma and beta + vgamma_grad = _mm512_fmadd_ps(_mm512_mul_ps(vy_grad, x_minus_mean), + vrvariance[idx], vgamma_grad); + vbeta_grad = _mm512_add_ps(vy_grad, vbeta_grad); + }; + + compile_time_for::op(compute_grad); + + // save gradient of gamma, beta + _mm512_storeu_ps(gamma_grad + j, vgamma_grad); + _mm512_storeu_ps(beta_grad + j, vbeta_grad); + } + + // Deal with the remain data + if (cols % 16 != 0) { + int remain = cols % 16; + auto remain_grad = [&](auto idx) { + for (int64 c = j; c < cols; ++c) { + const auto offset = (start_row + idx) * cols + c; + float vx_grad = y_grad[offset] * gamma[c]; + float x_minus_mean = x[offset] - mean[start_row + idx]; + vx_grad -= sum_m[idx] + sum_r[idx] * x_minus_mean; + vx_grad *= rvariance[start_row + idx]; + + // save gradient of x + x_grad[offset] = vx_grad; + + // gradient for gamma and beta + gamma_grad[c] += + y_grad[offset] * x_minus_mean * rvariance[start_row + idx]; + beta_grad[c] += y_grad[offset]; + } + }; + + compile_time_for::op(remain_grad); + } + } +#endif // backward layer norm avx512 impl +}; + +REGISTER_KERNEL_BUILDER( + Name("FusedLayerNormGrad").Device(DEVICE_CPU).TypeConstraint("T"), + FusedLayerNormGradOp); diff --git a/deepray/custom_ops/embedding_variable/cc/fused_layer_norm/fused_layer_normalize_ops_test.cc b/deepray/custom_ops/embedding_variable/cc/fused_layer_norm/fused_layer_normalize_ops_test.cc new file mode 100644 index 00000000..45b5ea15 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/fused_layer_norm/fused_layer_normalize_ops_test.cc @@ -0,0 +1,269 @@ +#include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" +#include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/conv_ops_gpu.h" +#include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session.h" + +namespace tensorflow { +namespace { + +enum class Device { CPU, GPU }; + +class FusedLayerNormalizeOpTest : public OpsTestBase { + protected: + void MakeOpAndSetDevice(Device device, DataType dtype, int axis, + float epsilon) { + TF_EXPECT_OK(NodeDefBuilder("fused_layer_normalize", "FusedLayerNorm") + .Attr("T", dtype) + .Attr("epsilon", epsilon) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + } +}; + +TEST_F(FusedLayerNormalizeOpTest, 2Dims_Float) { + const int rows = 7; + const int cols = 255; + + MakeOpAndSetDevice(Device::CPU, DT_FLOAT, 0, 1e-12); + + float input_array[1785]; + for (int i = 0; i < sizeof(input_array) / sizeof(float); i++) { + input_array[i] = 1.0; + } + for (int i = 0; i < rows; i++) { + input_array[i * cols] = 2.0; + } + AddInputFromArray(TensorShape({rows, cols}), input_array); + AddInput(TensorShape({cols}), [](int i) -> float { return 2.0; }); + AddInput(TensorShape({cols}), [](int i) -> float { return 1.0; }); + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_output(allocator(), DT_FLOAT, TensorShape({rows, cols})); + Tensor mean(allocator(), DT_FLOAT, TensorShape({rows})); + Tensor rvariance(allocator(), DT_FLOAT, TensorShape({rows})); + float 
output_array[1785]; + float rvar_value = 16.000125885009766f; + float mean_value = 256.0f / 255.0f; + // 1.00392162799835205f; + for (int i = 0; i < sizeof(output_array) / sizeof(float); i++) { + output_array[i] = 0.87450695037841797; + } + for (int i = 0; i < rows; i++) { + output_array[i * cols] = 2.0f * sqrtf(254.0f) + 1.0f; + // 32.874755859375; + } + + float mean_array[rows]; + for (int i = 0; i < sizeof(mean_array) / sizeof(float); i++) { + mean_array[i] = mean_value; + } + + float rvariance_array[rows]; + + for (int i = 0; i < sizeof(rvariance_array) / sizeof(float); i++) { + rvariance_array[i] = rvar_value; + } + test::FillValues(&expected_output, output_array); + test::FillValues(&mean, mean_array); + test::FillValues(&rvariance, rvariance_array); + test::ExpectTensorNear(expected_output, *GetOutput(0), 1e-5); + test::ExpectTensorNear(mean, *GetOutput(1), 1e-5); + test::ExpectTensorNear(rvariance, *GetOutput(2), 1e-5); + } +} + +class FusedLayerNormalizeGradOpTest : public OpsTestBase { + protected: + void MakeOpAndSetDevice(Device device, DataType dtype, int axis, + float epsilon) { + TF_EXPECT_OK( + NodeDefBuilder("fused_layer_normalize_grad", "FusedLayerNormGrad") + .Attr("T", dtype) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Input(FakeInput(DT_FLOAT)) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + } +}; + +TEST_F(FusedLayerNormalizeGradOpTest, 2Dims_Float) { + const int rows = 7; + const int cols = 255; + + MakeOpAndSetDevice(Device::CPU, DT_FLOAT, 0, 1e-12); + + AddInput(TensorShape({rows, cols}), + [](int i) -> float { return 1.0f; }); // y_grad + AddInput(TensorShape({rows, cols}), [](int i) -> float { + return (i % cols) ? 1.0f : 2.0f; + }); // x + AddInput(TensorShape({rows}), + [](int i) -> float { return 256.0f / 255.0f; }); // mean + AddInput(TensorShape({rows}), [](int i) -> float { + return 16.00012302493275484; + }); // rvariance + AddInput(TensorShape({cols}), + [](int i) -> float { return 2.0f; }); // gamma + + TF_ASSERT_OK(RunOpKernel()); + TF_EXPECT_OK(device_->Sync()); + + { + Tensor expected_output(allocator(), DT_FLOAT, TensorShape({rows, cols})); + Tensor gamma_grad(allocator(), DT_FLOAT, TensorShape({cols})); + Tensor beta_grad(allocator(), DT_FLOAT, TensorShape({cols})); + float x_grad[1785]; + for (int i = 0; i < sizeof(x_grad) / sizeof(float); i++) { + x_grad[i] = 0.0f; + } + for (int i = 0; i < rows; i++) { + x_grad[i * cols] = 0.00048447030712850392f; + } + + float gamma_grads[cols]; + for (int i = 0; i < sizeof(gamma_grads) / sizeof(float); i++) { + gamma_grads[i] = -0.4392257034778595; + } + gamma_grads[0] = 111.56163787841797; + + float beta_grads[cols]; + for (int i = 0; i < sizeof(beta_grads) / sizeof(float); i++) { + beta_grads[i] = 7.0f; + } + test::FillValues(&expected_output, x_grad); + test::FillValues(&gamma_grad, gamma_grads); + test::FillValues(&beta_grad, beta_grads); + test::ExpectTensorNear(expected_output, *GetOutput(0), 1e-5); + test::ExpectTensorNear(gamma_grad, *GetOutput(1), 1e-5); + test::ExpectTensorNear(beta_grad, *GetOutput(2), 1e-5); + } +} + +//----------------------------------------------------------------------------// +// Performance benchmarks // +//----------------------------------------------------------------------------// +static Graph* FusedLayerNormalize(int rows, int cols) { + Graph* g = new Graph(OpRegistry::Global()); + DataType dtype = DT_FLOAT; + + Tensor in(dtype, TensorShape({rows, cols})); + in.flat().setRandom(); + 
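+  // gamma and beta below are the per-column scale and shift; FusedLayerNorm
+  // requires size(gamma) == size(beta) == the last dimension of the input, so
+  // both are shaped {cols} to match the {rows, cols} input built above.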
Tensor gamma(dtype, TensorShape({cols})); + gamma.flat().setRandom(); + Tensor beta(dtype, TensorShape({cols})); + beta.flat().setRandom(); + + Node* input_in = test::graph::Constant(g, in); + Node* input_gamma = test::graph::Constant(g, gamma); + Node* input_beta = test::graph::Constant(g, beta); + auto nodeBuilder = NodeBuilder(g->NewName("n"), "FusedLayerNorm") + .Input(input_in) + .Input(input_gamma) + .Input(input_beta) + .Attr("T", dtype) + .Attr("epsilon", 1e-12); + TF_CHECK_OK(nodeBuilder.Finalize(g, nullptr)); + + return g; +} + +#define BM_FusedLayerNorm(ROWS, COLS, NTH) \ + static void BM_FusedLayerNorm##_##ROWS##_##COLS##_##NTH##_CPU(int iters) { \ + testing::UseRealTime(); \ + testing::ItemsProcessed(static_cast(iters) * ROWS * COLS * 3); \ + SessionOptions opts; \ + opts.config.set_intra_op_parallelism_threads(NTH); \ + test::Benchmark("cpu", FusedLayerNormalize(ROWS, COLS), &opts).Run(iters); \ + } \ + BENCHMARK(BM_FusedLayerNorm##_##ROWS##_##COLS##_##NTH##_CPU); + +#define BM_FusedLayerNorm_NTH(ROWS, COLS) \ + BM_FusedLayerNorm(ROWS, COLS, 1); \ + BM_FusedLayerNorm(ROWS, COLS, 4); \ + BM_FusedLayerNorm(ROWS, COLS, 8); + +BM_FusedLayerNorm_NTH(1024, 63); +BM_FusedLayerNorm_NTH(1024, 255); +BM_FusedLayerNorm_NTH(1024, 511); +BM_FusedLayerNorm_NTH(1024, 1023); +BM_FusedLayerNorm_NTH(1024, 1024); +BM_FusedLayerNorm_NTH(1024, 2048); +BM_FusedLayerNorm_NTH(1024, 4096); + +} // namespace + +static Graph* FusedLayerNormalizeGrad(int rows, int cols) { + Graph* g = new Graph(OpRegistry::Global()); + DataType dtype = DT_FLOAT; + + Tensor y_grad(dtype, TensorShape({rows, cols})); + y_grad.flat().setRandom(); + Tensor x(dtype, TensorShape({rows, cols})); + x.flat().setRandom(); + Tensor mean(dtype, TensorShape({rows})); + mean.flat().setRandom(); + Tensor rvarance(dtype, TensorShape({rows})); + rvarance.flat().setRandom(); + Tensor gamma(dtype, TensorShape({cols})); + gamma.flat().setRandom(); + + Node* input_y_grad = test::graph::Constant(g, y_grad); + Node* input_x = test::graph::Constant(g, x); + Node* input_mean = test::graph::Constant(g, mean); + Node* input_rvarance = test::graph::Constant(g, rvarance); + Node* input_gamma = test::graph::Constant(g, gamma); + auto nodeBuilder = NodeBuilder(g->NewName("n"), "FusedLayerNormGrad") + .Input(input_y_grad) + .Input(input_x) + .Input(input_mean) + .Input(input_rvarance) + .Input(input_gamma) + .Attr("T", dtype); + TF_CHECK_OK(nodeBuilder.Finalize(g, nullptr)); + + return g; +} + +#define BM_FusedLayerNormGrad(ROWS, COLS, NTH) \ + static void BM_FusedLayerNormGrad##_##ROWS##_##COLS##_##NTH##_CPU( \ + int iters) { \ + testing::UseRealTime(); \ + testing::ItemsProcessed(static_cast(iters) * ROWS * COLS * 3); \ + SessionOptions opts; \ + opts.config.set_intra_op_parallelism_threads(NTH); \ + test::Benchmark("cpu", FusedLayerNormalizeGrad(ROWS, COLS), &opts) \ + .Run(iters); \ + } \ + BENCHMARK(BM_FusedLayerNormGrad##_##ROWS##_##COLS##_##NTH##_CPU); + +#define BM_FusedLayerNormGrad_NTH(ROWS, COLS) \ + BM_FusedLayerNormGrad(ROWS, COLS, 1); \ + BM_FusedLayerNormGrad(ROWS, COLS, 4); \ + BM_FusedLayerNormGrad(ROWS, COLS, 8); + +BM_FusedLayerNormGrad_NTH(1024, 63); +BM_FusedLayerNormGrad_NTH(1024, 255); +BM_FusedLayerNormGrad_NTH(1024, 511); +BM_FusedLayerNormGrad_NTH(1024, 1023); +BM_FusedLayerNormGrad_NTH(1024, 1024); +BM_FusedLayerNormGrad_NTH(1024, 2048); +BM_FusedLayerNormGrad_NTH(1024, 4096); + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_ops.cc 
b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_ops.cc new file mode 100644 index 00000000..4603ed2f --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_ops.cc @@ -0,0 +1,176 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +=======================================================================*/ + +#define EIGEN_USE_THREADS + +#include "deepray/custom_ops/embedding_variable/cc/embedding/cache.h" +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h" +#include "group_embedding_lookup_sparse_forward_base_ops.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { + +#define USING_BASE_CLASS_MEMBER \ + using GroupLookupBaseCpuOp::m_num_lookup; \ + using GroupLookupBaseCpuOp::m_dimension; \ + using GroupLookupBaseCpuOp::m_is_use_default_value_tensor; + +using CPUDevice = Eigen::ThreadPoolDevice; + +template +class GroupEmbeddingVariableLookupDenseCpuOp + : public GroupLookupBaseCpuOp { + USING_BASE_CLASS_MEMBER + public: + explicit GroupEmbeddingVariableLookupDenseCpuOp(OpKernelConstruction* c) + : GroupLookupBaseCpuOp(c) { + OP_REQUIRES_OK(c, c->GetAttr("is_use_default_value_tensor", + &m_is_use_default_value_tensor)); + } + + void Compute(OpKernelContext* ctx) override { + /* + step 1: unique and assign unique output and index + step 2: doing parallel unique value gather + */ + auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); + for (int i = 0; i < m_num_lookup; ++i) { + EmbeddingVar* embedding_var = nullptr; + OP_REQUIRES_OK( + ctx, LookupResource(ctx, HandleFromInput(ctx, i), &embedding_var)); + core::ScopedUnref unref_me(embedding_var); + + const Tensor& dense_values_tensor = ctx->input(m_num_lookup + i); + auto dense_values = dense_values_tensor.flat().data(); + int nnz = dense_values_tensor.NumElements(); + + auto dense_values_tensor_shape = dense_values_tensor.shape(); + TensorShape emb_vectors_tensor_shape = + TensorShape(dense_values_tensor_shape); + emb_vectors_tensor_shape.AddDim(m_dimension); + Tensor* gather_embedding_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(i, emb_vectors_tensor_shape, + &gather_embedding_tensor)); + auto gather_embedding = gather_embedding_tensor->flat().data(); + + OP_REQUIRES( + ctx, + !embedding_var->IsMultiLevel() || (embedding_var->IsMultiLevel() && + embedding_var->CacheSize() >= nnz), + errors::InvalidArgument("MultiLevel EV's Cache size ", + embedding_var->CacheSize(), + " should large than IDs in batch ", nnz)); + + EmbeddingVarContext ev_ctx(ctx); + if (m_is_use_default_value_tensor) { + embedding_var->GetEmbeddings( + ev_ctx, dense_values, gather_embedding, nnz, + reinterpret_cast(ctx->input(m_num_lookup * 4 + 1).data())); + } else { + embedding_var->GetEmbeddings(ev_ctx, dense_values, 
gather_embedding, + nnz); + embedding_var->UpdateCache(dense_values_tensor, true); + } + } + } +}; + +#define REGISTER_CPU_KERNELS(key_type, value_type) \ + REGISTER_KERNEL_BUILDER( \ + Name("GroupEmbeddingVarLookupDense") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + GroupEmbeddingVariableLookupDenseCpuOp) + +REGISTER_CPU_KERNELS(int32, float); +REGISTER_CPU_KERNELS(int64, float); +#undef REGISTER_CPU_KERNELS + +template +class GroupVariableLookupDenseCpuOp + : public GroupLookupBaseCpuOp { + USING_BASE_CLASS_MEMBER + public: + explicit GroupVariableLookupDenseCpuOp(OpKernelConstruction* c) + : GroupLookupBaseCpuOp(c) {} + + void Compute(OpKernelContext* ctx) override { + /* + step 1: unique and assign unique output and index + step 2: doing parallel unique value gather + */ + auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); + for (int i = 0; i < m_num_lookup; ++i) { + const Tensor& emb_variable_tensor = ctx->input(i); + auto embedding_variable = emb_variable_tensor.flat().data(); + + const Tensor& dense_values_tensor = ctx->input(m_num_lookup + i); + + int nnz = dense_values_tensor.NumElements(); + + auto dense_values_tensor_shape = dense_values_tensor.shape(); + TensorShape emb_vectors_tensor_shape = + TensorShape(dense_values_tensor_shape); + emb_vectors_tensor_shape.AddDim(m_dimension); + Tensor* gather_embedding_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(i, emb_vectors_tensor_shape, + &gather_embedding_tensor)); + auto gather_embedding = gather_embedding_tensor->flat().data(); + + // Stage 1 + Tensor unique_idx_tensor; + Tensor unique_tensor; + Tensor unique_counter; + + UniqueWithoutAxis( + ctx, dense_values_tensor, &unique_idx_tensor, &unique_tensor, + &unique_counter, 0, this->partition_size_, this->serial_, + this->unique_ratio_hint_, this->map_flag_); + + ctx->set_output(m_num_lookup + i, unique_tensor); + ctx->set_output(2 * m_num_lookup + i, unique_idx_tensor); + auto* unique = unique_tensor.flat().data(); + auto* unique_idx = unique_idx_tensor.flat().data(); + int slice_bytes = nnz * m_dimension * 1000; + auto do_lookup = [this, ctx, embedding_variable, unique, unique_idx, + gather_embedding](int64 start, int64 end) { + for (int k = start; k < end; ++k) { + auto indices = unique_idx[k]; + TKey unique_id = unique[indices]; + memcpy(gather_embedding + k * m_dimension, + embedding_variable + unique_id * m_dimension, + sizeof(float) * m_dimension); + } + }; + Shard(worker_threads->num_threads, worker_threads->workers, nnz, + slice_bytes, do_lookup); + } + } +}; + +#define REGISTER_CPU_KERNELS(key_type, value_type) \ + REGISTER_KERNEL_BUILDER(Name("GroupVariableLookupDense") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + GroupVariableLookupDenseCpuOp) + +REGISTER_CPU_KERNELS(int32, float); +REGISTER_CPU_KERNELS(int64, float); +#undef REGISTER_CPU_KERNELS + +#undef USING_BASE_CLASS_MEMBER +} // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_ops.cu.cc b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_ops.cu.cc new file mode 100644 index 00000000..fefd6041 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_ops.cu.cc @@ -0,0 +1,105 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. 
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +=======================================================================*/ + +#define EIGEN_USE_THREADS +#if GOOGLE_CUDA +#define EIGEN_USE_GPU + +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h" +#include "group_embedding_lookup_sparse_forward_base_ops.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { + +using GPUDevice = Eigen::GpuDevice; + +#define USING_BASE_CLASS_MEMBER \ + using GroupLookupBaseCpuOp::m_num_lookup; \ + using GroupLookupBaseCpuOp::m_dimension; \ + using GroupLookupBaseCpuOp::m_is_use_default_value_tensor; + +template +class GroupEmbeddingVariableLookupDenseGpuOp + : public GroupLookupBaseCpuOp { + USING_BASE_CLASS_MEMBER + public: + explicit GroupEmbeddingVariableLookupDenseGpuOp(OpKernelConstruction* c) + : GroupLookupBaseCpuOp(c) { + OP_REQUIRES_OK(c, c->GetAttr("is_use_default_value_tensor", + &m_is_use_default_value_tensor)); + } + + void Compute(OpKernelContext* ctx) override { + auto stream = ctx->eigen_device().stream(); + + for (int i = 0; i < m_num_lookup; ++i) { + EmbeddingVar* embedding_var = nullptr; + OP_REQUIRES_OK( + ctx, LookupResource(ctx, HandleFromInput(ctx, i), &embedding_var)); + core::ScopedUnref unref_me(embedding_var); + + const Tensor& dense_values_tensor = ctx->input(m_num_lookup + i); + auto dense_values = dense_values_tensor.flat().data(); + int nnz = dense_values_tensor.NumElements(); + + auto dense_values_tensor_shape = dense_values_tensor.shape(); + TensorShape emb_vectors_tensor_shape = + TensorShape(dense_values_tensor_shape); + emb_vectors_tensor_shape.AddDim(m_dimension); + Tensor* gather_embedding_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(i, emb_vectors_tensor_shape, + &gather_embedding_tensor)); + auto gather_embedding = gather_embedding_tensor->flat().data(); + + OP_REQUIRES( + ctx, + !embedding_var->IsMultiLevel() || (embedding_var->IsMultiLevel() && + embedding_var->CacheSize() >= nnz), + errors::InvalidArgument("MultiLevel EV's Cache size ", + embedding_var->CacheSize(), + " should large than IDs in batch ", nnz)); + + EmbeddingVarContext ev_ctx(ctx); + if (m_is_use_default_value_tensor) { + embedding_var->GetEmbeddings( + ev_ctx, dense_values, gather_embedding, nnz, + reinterpret_cast(ctx->input(m_num_lookup * 4 + 1).data()), + stream); + } else { + embedding_var->GetEmbeddings(ev_ctx, dense_values, gather_embedding, + nnz, nullptr, stream); + embedding_var->UpdateCache(dense_values_tensor, true, stream); + } + } + } +}; + +#define REGISTER_GPU_KERNELS(key_type, value_type) \ + REGISTER_KERNEL_BUILDER( \ + Name("GroupEmbeddingVarLookupDense") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + GroupEmbeddingVariableLookupDenseGpuOp) + +REGISTER_GPU_KERNELS(int32, float); 
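+// Explanatory note: the REGISTER_GPU_KERNELS invocation above and the one
+// below instantiate GroupEmbeddingVariableLookupDenseGpuOp for int32 and
+// int64 keys with float values. The GPU path mirrors the CPU op earlier in
+// this patch, differing only in that the CUDA stream obtained from the Eigen
+// GPU device is threaded through GetEmbeddings() and UpdateCache().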
+REGISTER_GPU_KERNELS(int64, float); +#undef REGISTER_GPU_KERNELS + +#undef USING_BASE_CLASS_MEMBER +} // namespace tensorflow + +#endif // GOOGLE_CUDA \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_ops_test.cc b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_ops_test.cc new file mode 100644 index 00000000..e62668fc --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_ops_test.cc @@ -0,0 +1,1089 @@ +#include + +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h" +#include "deepray/custom_ops/embedding_variable/config.pb.h" +#include "deepray/custom_ops/utils/fake_input.h" +#include "deepray/custom_ops/utils/kernel_benchmark_testlib.h" +#include "deepray/custom_ops/utils/ops_testutil.h" +#include "deepray/custom_ops/utils/tensor_testutil.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +// #include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session.h" + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "tensorflow/core/common_runtime/gpu/gpu_device.h" +#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { + +enum DEVICE { CPU, GPU }; + +enum TestCase { Sqrtn, Mean, Sum, SqrtnAndMaxNorm200, MeanAndMaxNorm100 }; + +template +void get_node_attr_from_test_case(string& combiner_str, float& max_norm) { + if (test_case == Sqrtn) { + combiner_str = "sqrtn"; + max_norm = -1.0f; + } else if (test_case == Mean) { + combiner_str = "mean"; + max_norm = -1.0f; + } else if (test_case == Sum) { + combiner_str = "sum"; + max_norm = -1.0f; + } else if (test_case == SqrtnAndMaxNorm200) { + combiner_str = "sqrtn"; + max_norm = 200.0f; + } else if (test_case == MeanAndMaxNorm100) { + combiner_str = "mean"; + max_norm = 100.0f; + } +} + +template +void fill_var_vector_expected(Tensor* expected); + +template <> +void fill_var_vector_expected(Tensor* expected) { + test::FillValues( + expected, {22.627416610717773, 24.0416316986084, 25.45584487915039, + 26.870058059692383, 28.284271240234375, 29.698484420776367, + 31.112699508666992, 32.526912689208984, 73.90083312988281, + 75.63288879394531, 77.36493682861328, 79.09698486328125, + 80.82904052734375, 82.56108856201172, 84.29314422607422, + 86.02519226074219, 124.70765686035156, 126.43971252441406, + 128.17176818847656, 129.90380859375, 131.6358642578125, + 133.367919921875, 135.09996032714844, 136.83201599121094, + 107.48023223876953, 108.89444732666016, 110.30866241455078, + 111.72286987304688, 113.1370849609375, 114.55130004882812, + 115.96551513671875, 117.37973022460938}); +} + +template <> +void fill_var_vector_expected(Tensor* expected) { + test::FillValues( + expected, {16.00000000000000, 17.00000000000000, 18.00000000000000, + 19.00000000000000, 20.00000000000000, 21.00000000000000, + 22.00000000000000, 23.00000000000000, 42.66666793823242, + 43.66666793823242, 44.66666793823242, 45.66666793823242, + 46.66666793823242, 47.66666793823242, 48.66666793823242, + 49.66666793823242, 72.00000000000000, 73.00000000000000, + 74.00000000000000, 75.00000000000000, 76.00000000000000, + 
77.00000000000000, 78.00000000000000, 79.00000000000000, + 76.00000000000000, 77.00000000000000, 78.00000000000000, + 79.00000000000000, 80.00000000000000, 81.00000000000000, + 82.00000000000000, 83.00000000000000}); +} + +template <> +void fill_var_vector_expected(Tensor* expected) { + test::FillValues( + expected, {32.0, 34.0, 36.0, 38.0, 40.0, 42.0, 44.0, 46.0, + 128.0, 131.0, 134.0, 137.0, 140.0, 143.0, 146.0, 149.0, + 216.0, 219.0, 222.0, 225.0, 228.0, 231.0, 234.0, 237.0, + 152.0, 154.0, 156.0, 158.0, 160.0, 162.0, 164.0, 166.0}); +} + +template <> +void fill_var_vector_expected(Tensor* expected) { + test::FillValues( + expected, + {22.62741661, 24.04163170, 25.45584488, 26.87005806, 28.28427124, + 29.69848442, 31.11269951, 32.52691269, 73.90083313, 75.63288879, + 77.36493683, 79.09698486, 80.82904053, 82.56108856, 84.29314423, + 86.02519226, 92.61308289, 94.01081848, 95.40855408, 96.80628204, + 98.20401764, 99.60175323, 100.99948120, 102.39721680, 71.20205688, + 72.31395721, 73.42584991, 74.53774261, 75.64963531, 76.76153564, + 77.87342834, 78.98532867}); +} + +class GroupVariableForWardOpTest : public OpsTestBase { + protected: + template + void Run(DEVICE device) { + if (device == DEVICE::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + + DataType k_dtype = DataTypeToEnum::value; + DataType v_dtype = DataTypeToEnum::value; + std::string combiner_str; + float max_norm; + + const int nnz = 10; + const int batch_size = 4; + const int emb_vector_dim = 8; + const int entries = 8; + const int bucket_size = 16; + const int num_lookups = 2; + get_node_attr_from_test_case(combiner_str, max_norm); + + TF_EXPECT_OK(NodeDefBuilder("group_variable_lookup", "GroupVariableLookup") + .Input(FakeInput(num_lookups, v_dtype)) // ev + .Input(FakeInput(num_lookups, k_dtype)) // sp_values + .Input(FakeInput(num_lookups, DT_INT64)) // sp_indices + .Input(FakeInput(num_lookups, v_dtype)) // sp_weights + .Input(FakeInput(DT_INT32)) // dense_shape + .Input(FakeInput(v_dtype)) // default_value + .Attr("dtype", v_dtype) + .Attr("Tkeys", k_dtype) + .Attr("combiner", combiner_str) + .Attr("max_norm", max_norm) + .Attr("dimension", emb_vector_dim) + .Attr("num_lookups", num_lookups) + .Attr("ignore_weights", true) + .Attr("is_use_default_value_tensor", false) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + for (int i = 0; i < num_lookups; ++i) { + Tensor emb_variable(v_dtype, {bucket_size, emb_vector_dim}); + test::FillValues( + &emb_variable, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, + 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, + 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, + 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, + 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, + 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, + 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, + 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, + 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, + 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, + 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, + 110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, + 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0}); + + AddInputFromArray(emb_variable.shape(), + emb_variable.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values(k_dtype, {nnz}); + test::FillValues(&sp_values, {3, 1, 
4, 5, 7, 3, 12, 12, 15, 4}); + AddInputFromArray(sp_values.shape(), sp_values.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_indices(DT_INT64, {nnz}); + test::FillValues(&sp_indices, {0, 0, 1, 1, 1, 2, 2, 2, 3, 3}); + AddInputFromArray(sp_indices.shape(), sp_indices.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_weights(v_dtype, {nnz}); + test::FillValues(&sp_weights, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f}); + AddInputFromArray(sp_weights.shape(), sp_weights.flat()); + } + + Tensor* batch_size_tensor = + AddInput(DataTypeToEnum::v(), TensorShape({})); + auto batch_size_data = batch_size_tensor->flat().data(); + batch_size_data[0] = batch_size; + + Tensor* default_v_tensor = + AddInput(DataTypeToEnum::v(), TensorShape({})); + auto default_v = default_v_tensor->flat().data(); + default_v[0] = 1.0f; + + TF_ASSERT_OK(RunOpKernel()); + + Tensor emb_vector_expected(v_dtype, {batch_size, emb_vector_dim}); + Tensor unique_values_expected(DT_INT64, {7}); + Tensor unique_idx_expected(DT_INT32, {nnz}); + Tensor batch_size_expected(DT_INT32, {batch_size}); + + fill_var_vector_expected(&emb_vector_expected); + + if (device == DEVICE::GPU) { + test::FillValues(&batch_size_expected, {0, 2, 5, 8}); + } else { + test::FillValues(&unique_values_expected, {3, 1, 4, 5, 7, 12, 15}); + test::FillValues(&unique_idx_expected, + {0, 1, 2, 3, 4, 0, 5, 5, 6, 2}); + test::FillValues(&batch_size_expected, {2, 5, 8, 10}); + } + TF_EXPECT_OK(device_->Sync()); + + for (int i = 0; i < num_lookups; ++i) { + const Tensor& emb_vector = *GetOutput(i); + const Tensor& unique_values = *GetOutput(num_lookups + i); + const Tensor& unique_idx_output = *GetOutput(2 * num_lookups + i); + const Tensor& batch_size_output = *GetOutput(3 * num_lookups + i); + test::ExpectTensorNear(emb_vector_expected, emb_vector, 1e-4); + if (device == DEVICE::CPU) { + test::ExpectTensorEqual(unique_values_expected, unique_values); + test::ExpectTensorEqual(unique_idx_expected, unique_idx_output); + } + test::ExpectTensorEqual(batch_size_expected, batch_size_output); + } + } +}; + +#ifdef GOOGLE_CUDA +TEST_F(GroupVariableForWardOpTest, EmbeddingLocalSparseLookUpFloatSqrtnGpu) { + Run(DEVICE::GPU); +} + +TEST_F(GroupVariableForWardOpTest, EmbeddingLocalSparseLookUpFloatMeanGpu) { + Run(DEVICE::GPU); +} + +TEST_F(GroupVariableForWardOpTest, EmbeddingLocalSparseLookUpFloatSumGpu) { + Run(DEVICE::GPU); +} + +// TEST_F(GroupVariableForWardOpTest, +// EmbeddingLocalSparseLookUpFloatSqrtnAndMaxNorm200Gpu) { +// Run(DEVICE::GPU); +// } +#endif // GOOGLE_CUDA + +TEST_F(GroupVariableForWardOpTest, EmbeddingLocalSparseLookUpFloatSqrtnCpu) { + Run(DEVICE::CPU); +} + +TEST_F(GroupVariableForWardOpTest, EmbeddingLocalSparseLookUpFloatMeanCpu) { + Run(DEVICE::CPU); +} + +TEST_F(GroupVariableForWardOpTest, EmbeddingLocalSparseLookUpFloatSumCpu) { + Run(DEVICE::CPU); +} + +// TEST_F(GroupVariableForWardOpTest, +// EmbeddingLocalSparseLookUpFloatSqrtnAndMaxNorm200Cpu) { +// Run(DEVICE::CPU); +// } + +template +void fill_var_grad_expected(Tensor* expected); + +template <> +void fill_var_grad_expected(Tensor* expected) { + test::FillValues( + expected, {0.000000000000000, 0.7071067690849304, 1.4142135381698608, + 2.1213204860687256, 2.8284270763397217, 3.535533905029297, + 4.242640972137451, 4.949747562408447, 0.000000000000000, + 0.7071067690849304, 1.4142135381698608, 2.1213204860687256, + 2.8284270763397217, 3.535533905029297, 4.242640972137451, + 4.949747562408447, 4.618802070617676, 
5.196152687072754, + 5.773502826690674, 6.350852966308594, 6.928203582763672, + 7.505553722381592, 8.082903861999512, 8.66025447845459, + 4.618802070617676, 5.196152687072754, 5.773502826690674, + 6.350852966308594, 6.928203582763672, 7.505553722381592, + 8.082903861999512, 8.66025447845459, 4.618802070617676, + 5.196152687072754, 5.773502826690674, 6.350852966308594, + 6.928203582763672, 7.505553722381592, 8.082903861999512, + 8.66025447845459, 9.237604141235352, 9.81495475769043, + 10.392305374145508, 10.96965503692627, 11.547005653381348, + 12.124356269836426, 12.701705932617188, 13.279056549072266, + 16.970563888549805, 17.677669525146484, 18.384777069091797, + 19.091882705688477, 19.79899024963379, 20.5060977935791, + 21.21320343017578, 21.920310974121094}); +} + +template <> +void fill_var_grad_expected(Tensor* expected) { + test::FillValues( + expected, {0.000000000000000, 0.500000000000000, 1.000000000000000, + 1.500000000000000, 2.000000000000000, 2.500000000000000, + 3.000000000000000, 3.500000000000000, 0.000000000000000, + 0.500000000000000, 1.000000000000000, 1.500000000000000, + 2.000000000000000, 2.500000000000000, 3.000000000000000, + 3.500000000000000, + + 2.6666667461395264, 3.000000000000000, 3.3333332538604736, + 3.6666667461395264, 4.000000000000000, 4.333333492279053, + 4.666666507720947, 5.000000000000000, 2.6666667461395264, + 3.000000000000000, 3.3333332538604736, 3.6666667461395264, + 4.000000000000000, 4.333333492279053, 4.666666507720947, + 5.000000000000000, 2.6666667461395264, 3.000000000000000, + 3.3333332538604736, 3.6666667461395264, 4.000000000000000, + 4.333333492279053, 4.666666507720947, 5.000000000000000, + 5.333333492279053, 5.666666507720947, 6.000000000000000, + 6.333333492279053, 6.666666507720947, 7.000000000000000, + 7.333333492279053, 7.666666507720947, 12.000000000000000, + 12.500000000000000, 13.000000000000000, 13.500000000000000, + 14.000000000000000, 14.500000000000000, 15.000000000000000, + 15.500000000000000}); +} + +template <> +void fill_var_grad_expected(Tensor* expected) { + test::FillValues( + expected, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 0.0, 1.0, 2.0, 3.0, + 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 8.0, 9.0, 10.0, 11.0, + 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, + 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0}); +} + +template <> +void fill_var_grad_expected(Tensor* expected) { + test::FillValues( + expected, {0.000000000000000, 0.7071067690849304, 1.4142135381698608, + 2.1213204860687256, 2.8284270763397217, 3.535533905029297, + 4.242640972137451, 4.949747562408447, 0.000000000000000, + 0.7071067690849304, 1.4142135381698608, 2.1213204860687256, + 2.8284270763397217, 3.535533905029297, 4.242640972137451, + 4.949747562408447, 4.618802070617676, 5.196152687072754, + 5.773502826690674, 6.350852966308594, 6.928203582763672, + 7.505553722381592, 8.082903861999512, 8.66025447845459, + 4.618802070617676, 5.196152687072754, 5.773502826690674, + 6.350852966308594, 6.928203582763672, 7.505553722381592, + 8.082903861999512, 8.66025447845459, 4.618802070617676, + 5.196152687072754, 5.773502826690674, 6.350852966308594, + 6.928203582763672, 7.505553722381592, 8.082903861999512, + 8.66025447845459, 9.237604141235352, 9.81495475769043, + 10.392305374145508, 10.96965503692627, 11.547005653381348, + 12.124356269836426, 12.701705932617188, 13.279056549072266, + 9.237604141235352, 9.81495475769043, 10.392305374145508, + 10.96965503692627, 
11.547005653381348, 12.124356269836426, + 12.701705932617188, 13.279056549072266, 9.237604141235352, + 9.81495475769043, 10.392305374145508, 10.96965503692627, + 11.547005653381348, 12.124356269836426, 12.701705932617188, + 13.279056549072266, 16.970563888549805, 17.677669525146484, + 18.384777069091797, 19.091882705688477, 19.79899024963379, + 20.5060977935791, 21.21320343017578, 21.920310974121094, + 16.970563888549805, 17.677669525146484, 18.384777069091797, + 19.091882705688477, 19.79899024963379, 20.5060977935791, + 21.21320343017578, 21.920310974121094}); +} + +template <> +void fill_var_grad_expected(Tensor* expected) { + test::FillValues( + expected, {0.000000000000000, 0.500000000000000, 1.000000000000000, + 1.500000000000000, 2.000000000000000, 2.500000000000000, + 3.000000000000000, 3.500000000000000, 0.000000000000000, + 0.500000000000000, 1.000000000000000, 1.500000000000000, + 2.000000000000000, 2.500000000000000, 3.000000000000000, + 3.500000000000000, 2.6666667461395264, 3.000000000000000, + 3.3333332538604736, 3.6666667461395264, 4.000000000000000, + 4.333333492279053, 4.666666507720947, 5.000000000000000, + 2.6666667461395264, 3.000000000000000, 3.3333332538604736, + 3.6666667461395264, 4.000000000000000, 4.333333492279053, + 4.666666507720947, 5.000000000000000, 2.6666667461395264, + 3.000000000000000, 3.3333332538604736, 3.6666667461395264, + 4.000000000000000, 4.333333492279053, 4.666666507720947, + 5.000000000000000, 5.333333492279053, 5.666666507720947, + 6.000000000000000, 6.333333492279053, 6.666666507720947, + 7.000000000000000, 7.333333492279053, 7.666666507720947, + 5.333333492279053, 5.666666507720947, 6.000000000000000, + 6.333333492279053, 6.666666507720947, 7.000000000000000, + 7.333333492279053, 7.666666507720947, 5.333333492279053, + 5.666666507720947, 6.000000000000000, 6.333333492279053, + 6.666666507720947, 7.000000000000000, 7.333333492279053, + 7.666666507720947, 12.000000000000000, 12.500000000000000, + 13.000000000000000, 13.500000000000000, 14.000000000000000, + 14.500000000000000, 15.000000000000000, 15.500000000000000, + 12.000000000000000, 12.500000000000000, 13.000000000000000, + 13.500000000000000, 14.000000000000000, 14.500000000000000, + 15.000000000000000, 15.500000000000000}); +} + +template <> +void fill_var_grad_expected(Tensor* expected) { + test::FillValues( + expected, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 0.0, 1.0, 2.0, 3.0, + 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, + 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 8.0, 9.0, 10.0, 11.0, + 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, + 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 16.0, 17.0, 18.0, 19.0, + 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, + 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0}); +} + +// template <> +// void fill_var_grad_expected(Tensor* expected) { +// test::FillValues( +// expected, +// {0.00000000, 0.50000000, 1.00000000, 1.50000000, 2.00000000, +// 2.50000000, 3.00000000, 3.50000000, 0.00000000, 0.50000000, +// 1.00000000, 1.50000000, 2.00000000, 2.50000000, 3.00000000, +// 3.50000000, 2.65028572, 2.98157120, 3.31285667, 3.64414287, +// 3.97542834, 4.30671406, 4.63799953, 4.96928549, 2.16437674, +// 2.43492365, 2.70547056, 2.97601795, 3.24656487, 3.51711202, +// 3.78765893, 4.05820608, 1.58337951, 1.78130186, 1.97922409, +// 2.17714667, 2.37506914, 2.57299161, 2.77091384, 2.96883631, +// 5.33333349, 5.66666651, 6.00000000, 6.33333349, 6.66666651, +// 7.00000000, 7.33333349, 
7.66666651, 1.89459133, 2.01300311, +// 2.13141513, 2.24982715, 2.36823893, 2.48665094, 2.60506320, +// 2.72347474, 1.89459133, 2.01300311, 2.13141513, 2.24982715, +// 2.36823893, 2.48665094, 2.60506320, 2.72347474, 3.43474555, +// 3.57786012, 3.72097445, 3.86408877, 4.00720310, 4.15031767, +// 4.29343224, 4.43654633, 11.92628479, 12.42321396, 12.92014217, +// 13.41707039, 13.91399956, 14.41092777, 14.90785599, 15.40478516}); +// } + +class GroupVariableBackWardOpTest : public OpsTestBase { + protected: + template + void Run(DEVICE device) { + if (device == DEVICE::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + + DataType k_dtype = DataTypeToEnum::value; + DataType v_dtype = DataTypeToEnum::value; + std::string combiner_str; + float max_norm; + + const int nnz = 7; + const int nums = 10; + const int batch_size = 4; + const int emb_vector_dim = 8; + const int entries = 8; + const int bucket_size = 16; + const int num_lookups = 2; + get_node_attr_from_test_case(combiner_str, max_norm); + + TF_EXPECT_OK( + NodeDefBuilder("group_variable_lookup_grad", "GroupVariableLookupGrad") + .Input(FakeInput(num_lookups, DT_FLOAT)) // grads + .Input(FakeInput(num_lookups, v_dtype)) // variable + .Input(FakeInput(num_lookups, k_dtype)) // unique_key + .Input(FakeInput(num_lookups, DT_INT64)) // unique_idx + .Input(FakeInput(num_lookups, DT_INT32)) // batch_nums + .Attr("dtype", v_dtype) + .Attr("Tkeys", k_dtype) + .Attr("combiner", combiner_str) + .Attr("max_norm", max_norm) + .Attr("dimension", emb_vector_dim) + .Attr("num_lookups", num_lookups) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + for (int i = 0; i < num_lookups; ++i) { + Tensor top_grad(DT_FLOAT, {batch_size, emb_vector_dim}); + test::FillValues( + &top_grad, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, + 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0}); + + AddInputFromArray(top_grad.shape(), top_grad.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor emb_variable(v_dtype, {bucket_size, emb_vector_dim}); + test::FillValues( + &emb_variable, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, + 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, + 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, + 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, + 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, + 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, + 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, + 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, + 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, + 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, + 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, + 110.0, 111.0, 112.0, 113.0, 114.0, 115.0, 116.0, 117.0, 118.0, 119.0, + 120.0, 121.0, 122.0, 123.0, 124.0, 125.0, 126.0, 127.0}); + AddInputFromArray(emb_variable.shape(), + emb_variable.flat()); + } + if (device == DEVICE::GPU) { + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values(k_dtype, {nums}); + test::FillValues(&sp_values, {3, 1, 4, 5, 7, 3, 12, 12, 15, 4}); + AddInputFromArray(sp_values.shape(), sp_values.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values_offset(DT_INT64, {nnz}); + test::FillValues(&sp_values_offset, {0, 0, 1, 1, 1, 2, 3}); + AddInputFromArray(sp_values_offset.shape(), + 
sp_values_offset.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values_offset(DT_INT32, {batch_size}); + test::FillValues(&sp_values_offset, {0, 2, 5, 8}); + AddInputFromArray(sp_values_offset.shape(), + sp_values_offset.flat()); + } + TF_ASSERT_OK(RunOpKernel()); + + Tensor grad_expected(v_dtype, {nums, emb_vector_dim}); + fill_var_grad_expected(&grad_expected); + + TF_EXPECT_OK(device_->Sync()); + + for (int i = 0; i < num_lookups; ++i) { + const Tensor& grad = *GetOutput(i); + test::ExpectTensorNear(grad_expected, grad, 1e-4); + } + } else { + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values(k_dtype, {nnz}); + test::FillValues(&sp_values, {3, 1, 4, 5, 7, 12, 15}); + AddInputFromArray(sp_values.shape(), sp_values.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values_offset(DT_INT64, {nnz}); + test::FillValues(&sp_values_offset, {0, 0, 1, 1, 1, 2, 3}); + AddInputFromArray(sp_values_offset.shape(), + sp_values_offset.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values_offset(DT_INT32, {batch_size}); + test::FillValues(&sp_values_offset, {2, 5, 8, 10}); + AddInputFromArray(sp_values_offset.shape(), + sp_values_offset.flat()); + } + TF_ASSERT_OK(RunOpKernel()); + + Tensor grad_expected(v_dtype, {nnz, emb_vector_dim}); + fill_var_grad_expected(&grad_expected); + + TF_EXPECT_OK(device_->Sync()); + + for (int i = 0; i < num_lookups; ++i) { + const Tensor& grad = *GetOutput(i); + test::ExpectTensorNear(grad_expected, grad, 1e-4); + } + } + } +}; + +#ifdef GOOGLE_CUDA +TEST_F(GroupVariableBackWardOpTest, EmbeddingLocalSparseLookUpGradFloatGpu) { + Run(DEVICE::GPU); +} + +TEST_F(GroupVariableBackWardOpTest, + EmbeddingLocalSparseLookUpGradFloatMeanGpu) { + Run(DEVICE::GPU); +} + +TEST_F(GroupVariableBackWardOpTest, EmbeddingLocalSparseLookUpGradFloatSumGpu) { + Run(DEVICE::GPU); +} + +// TEST_F(GroupVariableBackWardOpTest, +// EmbeddingLocalSparseLookUpGradFloatMeanAndMaxNorm100Gpu) { +// Run(DEVICE::GPU); +// } +#endif // GOOGLE_CUDA + +TEST_F(GroupVariableBackWardOpTest, + EmbeddingLocalSparseLookUpGradFloatSqrtCpu) { + Run(DEVICE::CPU); +} + +TEST_F(GroupVariableBackWardOpTest, + EmbeddingLocalSparseLookUpGradFloatMeanCpu) { + Run(DEVICE::CPU); +} + +TEST_F(GroupVariableBackWardOpTest, EmbeddingLocalSparseLookUpGradFloatSumCpu) { + Run(DEVICE::CPU); +} + +// TEST_F(GroupVariableBackWardOpTest, +// EmbeddingLocalSparseLookUpGradFloatMeanAndMaxNorm100Cpu) { +// Run(DEVICE::CPU); +// } + +template +void fill_ev_vector_expected(Tensor* expected); + +template <> +void fill_ev_vector_expected(Tensor* expected) { + test::FillValues( + expected, + {1.41421, 1.41421, 1.41421, 1.41421, 1.41421, 1.41421, 1.41421, 1.41421, + 1.73205, 1.73205, 1.73205, 1.73205, 1.73205, 1.73205, 1.73205, 1.73205, + 1.73205, 1.73205, 1.73205, 1.73205, 1.73205, 1.73205, 1.73205, 1.73205, + 1.41421, 1.41421, 1.41421, 1.41421, 1.41421, 1.41421, 1.41421, 1.41421}); +} + +template <> +void fill_ev_vector_expected(Tensor* expected) { + test::FillValues( + expected, { + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + }); +} + +template <> +void fill_ev_vector_expected(Tensor* expected) { + test::FillValues( + expected, {2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, + 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, + 3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0}); +} + +template <> +void fill_ev_vector_expected(Tensor* 
expected) { + test::FillValues( + expected, + {22.62741661, 24.04163170, 25.45584488, 26.87005806, 28.28427124, + 29.69848442, 31.11269951, 32.52691269, 73.90083313, 75.63288879, + 77.36493683, 79.09698486, 80.82904053, 82.56108856, 84.29314423, + 86.02519226, 92.61308289, 94.01081848, 95.40855408, 96.80628204, + 98.20401764, 99.60175323, 100.99948120, 102.39721680, 71.20205688, + 72.31395721, 73.42584991, 74.53774261, 75.64963531, 76.76153564, + 77.87342834, 78.98532867}); +} + +class GroupEmbeddingVariableForWardOpTest : public OpsTestBase { + protected: + template + void Run(DEVICE device) { + if (device == DEVICE::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + + DataType k_dtype = DataTypeToEnum::value; + DataType v_dtype = DataTypeToEnum::value; + // TensorShapeProto tshape_proto; + // tshape_proto.add_dim()->set_size(8); + // TF_EXPECT_OK(NodeDefBuilder("kv_var_handle", "KvVarHandleOp") + // .Attr("dtype", v_dtype) + // .Attr("Tkeys", k_dtype) + // .Attr("shape", tshape_proto) + // .Attr("container", "EV") + // .Attr("shared_name", "EV") + // .Finalize(node_def())); + // TF_EXPECT_OK(InitOp()); + // TF_ASSERT_OK(RunOpKernel()); + // const Tensor& ev_resource = *GetOutput(0); + // ResourceHandle ev_handle = ev_resource.flat()(0); + + // TF_EXPECT_OK(NodeDefBuilder("initialize_kv_variable", + // "InitializeKvVariableOp") + // .Input(FakeInput(DT_RESOURCE)) // ev + // .Input(FakeInput(DT_RESOURCE)) // ev + // .Input(FakeInput(v_dtype)) // sp_values + // .Input(FakeInput(k_dtype)) // sp_indices + // .Attr("dtype", v_dtype) + // .Attr("Tkeys", k_dtype) + // .Attr("slot_num", 0) + // .Attr("shape", tshape_proto) + // .Attr("initial_num_buckets", 131072) // 2^17 + // .Attr("max_load_factor", 0.8) + // .Attr("steps_to_live", 0) + // .Attr("emb_index", 0) + // .Attr("block_num", 1) + // .Attr("slot_index", 0) + // .Attr("ht_partition_num", 1000) + // .Attr("filter_freq", 0) + // .Attr("max_freq", 999999) + // .Attr("max_element_size", 0) + // .Attr("counter_type", k_dtype) + // .Attr("false_positive_probability", -1.0) + // .Attr("l2_weight_threshold", -1.0) + // .Attr("layout", "") + // .Attr("storage_type", 0) + // .Attr("default_value_dim", 8) + // .Attr("default_value_no_permission", 0.0) + // .Attr("record_freq", false) + // .Attr("record_version", false) + // .Finalize(node_def())); + // TF_EXPECT_OK(InitOp()); + + // AddInputFromArray(TensorShape({}), {ev_handle}); + // AddInputFromArray(TensorShape({}), {ev_handle}); + + // Tensor default_values(v_dtype, {8}); + // test::FillValues(&default_values, + // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}); + // AddInputFromArray(default_values.shape(), + // default_values.flat()); + // Tensor empty_key(k_dtype, {1}); + // test::FillValues(&empty_key, {-1}); + // AddInputFromArray(empty_key.shape(), empty_key.flat()); + // TF_ASSERT_OK(RunOpKernel()); + + // Clear Resource + // inputs_.clear(); + // gtl::STLDeleteElements(&tensors_); + // gtl::STLDeleteElements(&managed_outputs_); + + std::string combiner_str; + float max_norm; + + const int nnz = 10; + const int batch_size = 4; + const int emb_vector_dim = 8; + const int num_lookups = 2; + std::vector sp_values_vec{3, 1, 4, 5, 7, 3, 12, 12, 15, 4}; + get_node_attr_from_test_case(combiner_str, max_norm); + + TF_EXPECT_OK(NodeDefBuilder("group_embedding_variable_lookup", + "GroupEmbeddingVarLookup") + .Input(FakeInput(num_lookups, DT_RESOURCE)) // ev + .Input(FakeInput(num_lookups, k_dtype)) // sp_values + 
.Input(FakeInput(num_lookups, DT_INT64)) // sp_indices + .Input(FakeInput(num_lookups, v_dtype)) // sp_weights + .Input(FakeInput(DT_INT32)) // dense_shape + .Input(FakeInput(v_dtype)) // default_value + .Attr("dtype", v_dtype) + .Attr("Tkeys", k_dtype) + .Attr("combiner", combiner_str) + .Attr("max_norm", max_norm) + .Attr("dimension", emb_vector_dim) + .Attr("num_lookups", num_lookups) + .Attr("ignore_weights", true) + .Attr("is_use_default_value_tensor", false) + .Attr("is_inference", false) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + for (int i = 0; i < num_lookups; ++i) { + EmbeddingVar* embedding_var = nullptr; + Allocator* gpu_allocator = device_->GetAllocator(AllocatorAttributes()); + auto embedding_config = + EmbeddingConfig(0, 0, 1, 1, "", 0, 0, 99999, 14.0); + embedding::StorageType storage_type = embedding::StorageType::DRAM; + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), storage_type, false, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), 0}); + auto storage = embedding::StorageFactory::Create( + embedding::StorageConfig(storage_type, "", {1024, 1024, 1024, 1024}, + embedding_config), + gpu_allocator, feat_desc, "EV" + std::to_string(i)); + embedding_var = new EmbeddingVar("EV" + std::to_string(i), + storage, embedding_config, + gpu_allocator, feat_desc); + Tensor value(DT_FLOAT, TensorShape({emb_vector_dim})); + test::FillValues(&value, + std::vector(emb_vector_dim, 1.0)); + embedding_var->Init(value, 1); + + for (int64 j = 0; j < nnz; ++j) { + void* value_ptr = nullptr; + Status s = + embedding_var->LookupOrCreateKey(sp_values_vec[j], &value_ptr); + typename TTypes::Flat vflat = embedding_var->flat(value_ptr); + } + AddResourceInput>("", "EV" + std::to_string(i), + embedding_var); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values(k_dtype, {nnz}); + test::FillValues(&sp_values, sp_values_vec); + AddInputFromArray(sp_values.shape(), sp_values.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_indices(DT_INT64, {nnz}); + test::FillValues(&sp_indices, {0, 0, 1, 1, 1, 2, 2, 2, 3, 3}); + AddInputFromArray(sp_indices.shape(), sp_indices.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_weights(v_dtype, {nnz}); + test::FillValues(&sp_weights, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, 1.0f}); + AddInputFromArray(sp_weights.shape(), sp_weights.flat()); + } + + Tensor* batch_size_tensor = + AddInput(DataTypeToEnum::v(), TensorShape({})); + auto batch_size_data = batch_size_tensor->flat().data(); + batch_size_data[0] = batch_size; + + TF_ASSERT_OK(RunOpKernel()); + + Tensor emb_vector_expected(v_dtype, {batch_size, emb_vector_dim}); + Tensor sp_values_offset_expected(DT_INT64, {7}); + Tensor unique_idx_expected(DT_INT32, {nnz}); + Tensor batch_size_expected(DT_INT32, {batch_size}); + fill_ev_vector_expected(&emb_vector_expected); + + if (device == DEVICE::GPU) { + test::FillValues(&batch_size_expected, {0, 2, 5, 8}); + } else { + test::FillValues(&sp_values_offset_expected, + {3, 1, 4, 5, 7, 12, 15}); + test::FillValues(&unique_idx_expected, + {0, 1, 2, 3, 4, 0, 5, 5, 6, 2}); + test::FillValues(&batch_size_expected, {2, 5, 8, 10}); + } + TF_EXPECT_OK(device_->Sync()); + + for (int i = 0; i < num_lookups; ++i) { + const Tensor& emb_vector = *GetOutput(i); + const Tensor& values_offset = *GetOutput(num_lookups + i); + const Tensor& unique_idx_output = *GetOutput(2 * num_lookups + i); + const Tensor& batch_size_output = *GetOutput(3 * num_lookups + 
i); + test::ExpectTensorNear(emb_vector_expected, emb_vector, 1e-4); + // Currently GPU do not have Unique logic. + if (device == DEVICE::CPU) { + test::ExpectTensorEqual(sp_values_offset_expected, + values_offset); + test::ExpectTensorEqual(unique_idx_expected, unique_idx_output); + } + test::ExpectTensorEqual(batch_size_expected, batch_size_output); + } + } +}; + +#ifdef GOOGLE_CUDA +// TODO(junqi): Complete GPUEV related test +// TEST_F(GroupEmbeddingVariableForWardOpTest, +// EmbeddingLocalSparseLookUpFloatSqrtnGpu) { +// Run(DEVICE::GPU); +// } + +// TEST_F(GroupEmbeddingVariableForWardOpTest, +// EmbeddingLocalSparseLookUpFloatMeanGpu) { +// Run(DEVICE::GPU); +// } + +// TEST_F(GroupEmbeddingVariableForWardOpTest, +// EmbeddingLocalSparseLookUpFloatSumGpu) { +// Run(DEVICE::GPU); +// } + +// TEST_F(GroupEmbeddingVariableForWardOpTest, +// EmbeddingLocalSparseLookUpFloatSqrtnAndMaxNorm200Gpu) { +// Run(DEVICE::GPU); +// } +#endif // GOOGLE_CUDA + +TEST_F(GroupEmbeddingVariableForWardOpTest, + EmbeddingVarLocalSparseLookUpFloatSqrtnCpu) { + Run(DEVICE::CPU); +} + +TEST_F(GroupEmbeddingVariableForWardOpTest, + EmbeddingVarLocalSparseLookUpFloatMeanCpu) { + Run(DEVICE::CPU); +} + +TEST_F(GroupEmbeddingVariableForWardOpTest, + EmbeddingVarLocalSparseLookUpFloatSumCpu) { + Run(DEVICE::CPU); +} + +// TEST_F(GroupEmbeddingForWardOpTest, +// EmbeddingLocalSparseLookUpFloatSqrtnAndMaxNorm200Cpu) { +// Run(DEVICE::CPU); +// } + +class GroupEmbeddingVariableBackWardOpTest : public OpsTestBase { + protected: + template + void Run(DEVICE device) { + if (device == DEVICE::GPU) { + SetDevice(DEVICE_GPU, + std::unique_ptr(DeviceFactory::NewDevice( + "GPU", {}, "/job:a/replica:0/task:0"))); + } + + DataType k_dtype = DataTypeToEnum::value; + DataType v_dtype = DataTypeToEnum::value; + std::string combiner_str; + float max_norm; + + const int nums = 10; + const int nnz = 7; + const int batch_size = 4; + const int emb_vector_dim = 8; + const int entries = 8; + const int bucket_size = 16; + const int num_lookups = 2; + std::vector sp_values_vec{3, 1, 4, 5, 7, 3, 12, 12, 15, 4}; + get_node_attr_from_test_case(combiner_str, max_norm); + + TF_EXPECT_OK(NodeDefBuilder("group_embedding_variable_lookup_grad", + "GroupEmbeddingVariableLookupGrad") + .Input(FakeInput(num_lookups, DT_FLOAT)) // grads + .Input(FakeInput(num_lookups, DT_RESOURCE)) // ev + .Input(FakeInput(num_lookups, k_dtype)) // unique_key + .Input(FakeInput(num_lookups, DT_INT64)) // unique_idx + .Input(FakeInput(num_lookups, DT_INT32)) // batch_nums + .Attr("dtype", v_dtype) + .Attr("Tkeys", k_dtype) + .Attr("combiner", combiner_str) + .Attr("max_norm", max_norm) + .Attr("dimension", emb_vector_dim) + .Attr("num_lookups", num_lookups) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + + for (int i = 0; i < num_lookups; ++i) { + Tensor top_grad(DT_FLOAT, {batch_size, emb_vector_dim}); + test::FillValues( + &top_grad, + {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, + 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, + 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0}); + + AddInputFromArray(top_grad.shape(), top_grad.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + EmbeddingVar* embedding_var = nullptr; + Allocator* gpu_allocator = device_->GetAllocator(AllocatorAttributes()); + auto embedding_config = + EmbeddingConfig(0, 0, 1, 1, "", 0, 0, 99999, 14.0); + embedding::StorageType storage_type = embedding::StorageType::DRAM; + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, 
ev_allocator(), storage_type, false, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), 0}); + auto storage = embedding::StorageFactory::Create( + embedding::StorageConfig(storage_type, "", {1024, 1024, 1024, 1024}, + embedding_config), + gpu_allocator, feat_desc, "EV" + std::to_string(i)); + embedding_var = new EmbeddingVar("EV" + std::to_string(i), + storage, embedding_config, + gpu_allocator, feat_desc); + Tensor value(DT_FLOAT, TensorShape({emb_vector_dim})); + test::FillValues(&value, + std::vector(emb_vector_dim, 1.0)); + embedding_var->Init(value, 1); + + for (int64 j = 0; j < nnz; ++j) { + void* value_ptr = nullptr; + Status s = + embedding_var->LookupOrCreateKey(sp_values_vec[j], &value_ptr); + typename TTypes::Flat vflat = embedding_var->flat(value_ptr); + } + AddResourceInput>("", "EV" + std::to_string(i), + embedding_var); + } + + if (device == DEVICE::GPU) { + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values(k_dtype, {nums}); + test::FillValues(&sp_values, {3, 1, 4, 5, 7, 3, 12, 12, 15, 4}); + AddInputFromArray(sp_values.shape(), sp_values.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values_offset(DT_INT64, {nnz}); + test::FillValues(&sp_values_offset, {0, 0, 1, 1, 1, 2, 3}); + AddInputFromArray(sp_values_offset.shape(), + sp_values_offset.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values_offset(DT_INT32, {batch_size}); + test::FillValues(&sp_values_offset, {0, 2, 5, 8}); + AddInputFromArray(sp_values_offset.shape(), + sp_values_offset.flat()); + } + TF_ASSERT_OK(RunOpKernel()); + + Tensor grad_expected(v_dtype, {nums, emb_vector_dim}); + fill_var_grad_expected(&grad_expected); + + TF_EXPECT_OK(device_->Sync()); + + for (int i = 0; i < num_lookups; ++i) { + const Tensor& grad = *GetOutput(i); + test::ExpectTensorNear(grad_expected, grad, 1e-4); + } + } else { + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values(k_dtype, {nnz}); + test::FillValues(&sp_values, {3, 1, 4, 5, 7, 12, 15}); + AddInputFromArray(sp_values.shape(), sp_values.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values_offset(DT_INT64, {nnz}); + test::FillValues(&sp_values_offset, {0, 0, 1, 1, 1, 2, 3}); + AddInputFromArray(sp_values_offset.shape(), + sp_values_offset.flat()); + } + + for (int i = 0; i < num_lookups; ++i) { + Tensor sp_values_offset(DT_INT32, {batch_size}); + test::FillValues(&sp_values_offset, {2, 5, 8, 10}); + AddInputFromArray(sp_values_offset.shape(), + sp_values_offset.flat()); + } + TF_ASSERT_OK(RunOpKernel()); + + Tensor grad_expected(v_dtype, {nnz, emb_vector_dim}); + fill_var_grad_expected(&grad_expected); + + TF_EXPECT_OK(device_->Sync()); + + for (int i = 0; i < num_lookups; ++i) { + const Tensor& grad = *GetOutput(i); + test::ExpectTensorNear(grad_expected, grad, 1e-4); + } + } + } +}; + +#ifdef GOOGLE_CUDA +// TODO(junqi): Complete GPUEV related test + +// TEST_F(GroupEmbeddingVariableBackWardOpTest, +// EmbeddingLocalSparseLookUpGradFloatGpu) { +// Run(DEVICE::GPU); +// } + +// TEST_F(GroupEmbeddingVariableBackWardOpTest, +// EmbeddingLocalSparseLookUpGradFloatMeanGpu) { +// Run(DEVICE::GPU); +// } + +// TEST_F(GroupEmbeddingVariableBackWardOpTest, +// EmbeddingLocalSparseLookUpGradFloatSumGpu) { +// Run(DEVICE::GPU); +// } + +// TEST_F(GroupEmbeddingVariableBackWardOpTest, +// EmbeddingLocalSparseLookUpGradFloatMeanAndMaxNorm100Gpu) { +// Run(DEVICE::GPU); +// } +#endif // GOOGLE_CUDA + +TEST_F(GroupEmbeddingVariableBackWardOpTest, + 
EmbeddingVarLocalSparseLookUpGradFloatSqrtCpu) { + Run(DEVICE::CPU); +} + +TEST_F(GroupEmbeddingVariableBackWardOpTest, + EmbeddingVarLocalSparseLookUpGradFloatMeanCpu) { + Run(DEVICE::CPU); +} + +TEST_F(GroupEmbeddingVariableBackWardOpTest, + EmbeddingVarLocalSparseLookUpGradFloatSumCpu) { + Run(DEVICE::CPU); +} + +// TEST_F(GroupEmbeddingVariableBackWardOpTest, +// EmbeddingLocalSparseLookUpGradFloatMeanAndMaxNorm100Cpu) { +// Run(DEVICE::CPU); +// } + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_backward_base_ops.cu.h b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_backward_base_ops.cu.h new file mode 100644 index 00000000..8ced8a0c --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_backward_base_ops.cu.h @@ -0,0 +1,371 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +=======================================================================*/ + +#define EIGEN_USE_THREADS + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU + +#include +#include + +#include "deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_common.cu.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { + +namespace { + +template +struct GroupEmbeddingBackWardArgs { + GroupEmbeddingBackWardArgs() = default; + GroupEmbeddingBackWardArgs(TValue *grads, TKey *sp_values, + TValue *emb_variable, TValue *grads_output, + int *offset_indices, int nnz) + : grads_(grads), + sp_values_(sp_values), + emb_variable_(emb_variable), + grads_output_(grads_output), + offset_indices_(offset_indices), + nnz_(nnz) {} + TValue *grads_; + TKey *sp_values_; + TValue *emb_variable_; + TValue *grads_output_; + int *offset_indices_; + int nnz_; +}; + +template +__global__ void ComputeEVGradFn( + const int batch_size, const float max_norm, const int num_lookups, + const int dimension, GroupEmbeddingBackWardArgs *args) { + float l2_sum; + + const auto &block = cooperative_groups::this_thread_block(); + const auto &tile = cooperative_groups::tiled_partition(block); + // each block partition corresponding to one sample + const int bid = + block.group_index().x * tile.meta_group_size() + tile.meta_group_rank(); + // each thread corresponding to one element in the embedding vector + const int tid = tile.thread_rank(); + + if (bid < batch_size && tid < dimension) { + for (int idx = 0; idx < num_lookups; ++idx) { + int value_offset = args[idx].offset_indices_[bid]; + int feature_num; + if (bid == (batch_size - 1)) { + feature_num = args[idx].nnz_ - value_offset; + } else { + feature_num = args[idx].offset_indices_[bid + 1] - value_offset; + } + + if (feature_num > 0) { + float grad = args[idx].grads_[bid * dimension + tid]; + grad = CombineGrad(grad, feature_num); + + for (int j = 0; j < feature_num; ++j) { + float grad_i = grad; + int feature_offset = (value_offset + j) * dimension; + if (max_norm > 0.0f) { + 
float emb_element = 0.0f; // TODO(junqihu): get emb_weight + if (tid == 0) { + l2_sum = 0.0f; + } + tile.shfl(l2_sum, 0); + atomicAdd(&l2_sum, emb_element * emb_element); + tile.sync(); + float l2_norm = sqrtf(l2_sum); + if (l2_norm > max_norm) { + grad_i *= max_norm / l2_norm; + } + } + args[idx].grads_output_[(value_offset + j) * dimension + tid] = + grad_i; + } + } + } + } +} + +template +__global__ void ComputeSparseGradFn( + const int batch_size, const float max_norm, const int num_lookups, + const int dimension, GroupEmbeddingBackWardArgs *args) { + float l2_sum; + const auto &block = cooperative_groups::this_thread_block(); + const auto &tile = cooperative_groups::tiled_partition(block); + // each block partition corresponding to one sample + const int bid = + block.group_index().x * tile.meta_group_size() + tile.meta_group_rank(); + // each thread corresponding to one element in the embedding vector + const int tid = tile.thread_rank(); + + if (bid < batch_size && tid < dimension) { + for (int idx = 0; idx < num_lookups; ++idx) { + const int value_offset = args[idx].offset_indices_[bid]; + int feature_num; + if (bid == (batch_size - 1)) { + feature_num = args[idx].nnz_ - value_offset; + } else { + feature_num = args[idx].offset_indices_[bid + 1] - value_offset; + } + + if (feature_num > 0) { + float grad = args[idx].grads_[bid * dimension + tid]; + grad = CombineGrad(grad, feature_num); + for (int i = 0; i < feature_num; i++) { + float grad_i = grad; + if (max_norm > 0.0f) { + int64_t indices = int(args[idx].sp_values_[value_offset + i]); + float emb_element = + args[idx].emb_variable_[indices * dimension + tid]; + if (tid == 0) { + l2_sum = 0.0f; + } + tile.shfl(l2_sum, 0); + atomicAdd(&l2_sum, emb_element * emb_element); + tile.sync(); + float l2_norm = sqrtf(l2_sum); + if (l2_norm > max_norm) { + grad_i *= max_norm / l2_norm; + } + } + args[idx].grads_output_[(value_offset + i) * dimension + tid] = + grad_i; + } + } + } + } +} + +template +__global__ void NormalComputeEVGradFn( + const int batch_size, const float max_norm, const int num_lookups, + const int dimension, GroupEmbeddingBackWardArgs *args) { + __shared__ TValue l2_sum[1]; + + const auto &block = cooperative_groups::this_thread_block(); + // each block partition corresponding to one sample + const int bid = block.group_index().x; + // each thread corresponding to one element in the embedding vector + const int tid = block.thread_rank(); + + if (bid < batch_size && tid < dimension) { + for (int idx = 0; idx < num_lookups; ++idx) { + int value_offset = args[idx].offset_indices_[bid]; + int feature_num; + if (bid == (batch_size - 1)) { + feature_num = args[idx].nnz_ - value_offset; + } else { + feature_num = args[idx].offset_indices_[bid + 1] - value_offset; + } + + if (feature_num > 0) { + float grad = args[idx].grads_[bid * dimension + tid]; + grad = CombineGrad(grad, feature_num); + + for (int j = 0; j < feature_num; ++j) { + float grad_i = grad; + int feature_offset = (value_offset + j) * dimension; + if (max_norm > 0.0f) { + float emb_element = 0.0f; // TODO(junqihu): get emb_weight + if (tid == 0) { + l2_sum[0] = 0.0f; + } + __syncthreads(); + atomicAdd(l2_sum, emb_element * emb_element); + __syncthreads(); + float l2_norm = sqrtf(l2_sum[0]); + if (l2_norm > max_norm) { + grad_i *= max_norm / l2_norm; + } + } + args[idx].grads_output_[(value_offset + j) * dimension + tid] = + grad_i; + } + } + } + } +} + +template +__global__ void NormalComputeSparseGradFn( + const int batch_size, const float max_norm, const int 
num_lookups, + const int dimension, GroupEmbeddingBackWardArgs *args) { + __shared__ TValue l2_sum[1]; + + const auto &block = cooperative_groups::this_thread_block(); + // each block partition corresponding to one sample + const int bid = block.group_index().x; + // each thread corresponding to one element in the embedding vector + const int tid = block.thread_rank(); + + for (int idx = 0; idx < num_lookups; ++idx) { + const int value_offset = args[idx].offset_indices_[bid]; + int feature_num; + if (bid == (batch_size - 1)) { + feature_num = args[idx].nnz_ - value_offset; + } else { + feature_num = args[idx].offset_indices_[bid + 1] - value_offset; + } + + if (feature_num > 0) { + float grad = args[idx].grads_[bid * dimension + tid]; + grad = CombineGrad(grad, feature_num); + for (int i = 0; i < feature_num; i++) { + float grad_i = grad; + if (max_norm > 0.0f) { + int64_t indices = int(args[idx].sp_values_[value_offset + i]); + float emb_element = + args[idx].emb_variable_[indices * dimension + tid]; + if (tid == 0) { + l2_sum[0] = 0.0f; + } + __syncthreads(); + atomicAdd(l2_sum, emb_element * emb_element); + __syncthreads(); + float l2_norm = sqrtf(l2_sum[0]); + if (l2_norm > max_norm) { + grad_i *= max_norm / l2_norm; + } + } + args[idx].grads_output_[(value_offset + i) * dimension + tid] = grad_i; + } + } + } +} + +} // namespace + +template +class GroupEmbeddingLookupBackWard { + public: + explicit GroupEmbeddingLookupBackWard(int dimension, int num_lookups, + float max_norm, + Allocator *gpu_allocator = nullptr) + : alloc_(gpu_allocator) { + d_args_ = + TypedAllocator::Allocate>( + gpu_allocator, num_lookups, AllocationAttributes()); + h_args_.reserve(num_lookups); + max_norm_ = max_norm; + nums_ = num_lookups; + dimension_ = dimension; + } + + void set(GroupEmbeddingBackWardArgs &arg) { + h_args_.emplace_back(arg); + } + + ~GroupEmbeddingLookupBackWard() { + TypedAllocator::Deallocate(alloc_, d_args_, nums_); + } + + template + inline void Backward(GradFn fn, int batch_size, int tile_size, + cudaStream_t stream) { + CK_CUDA_THROW_(cudaMemcpyAsync( + d_args_, h_args_.data(), + h_args_.size() * sizeof(GroupEmbeddingBackWardArgs), + cudaMemcpyHostToDevice, stream)); + + { + if (tile_size <= 32) { + const int block_size = batch_size * tile_size / 64 + 1; + + fn<<>>(batch_size, max_norm_, nums_, + dimension_, d_args_); + } else { + fn<<>>(batch_size, max_norm_, nums_, + dimension_, d_args_); + } + } + + CK_CUDA_THROW_(cudaGetLastError()); + } + + protected: + std::vector> h_args_; + GroupEmbeddingBackWardArgs *d_args_; + Allocator *alloc_; + float max_norm_; + int nums_; + int dimension_; +}; + +template +class GroupLookupBackWardBaseOp : public OpKernel { + public: + explicit GroupLookupBackWardBaseOp(OpKernelConstruction *c) : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("combiner", &combiner_)); + OP_REQUIRES_OK(c, c->GetAttr("max_norm", &max_norm_)); + OP_REQUIRES_OK(c, c->GetAttr("num_lookups", &num_lookups_)); + OP_REQUIRES_OK(c, c->GetAttr("dimension", &dimension_)); + } + + template + inline void compute(GroupEmbeddingLookupBackWard &lookuper, + const int batch_size, cudaStream_t stream) { + if (Isev) { + if (dimension_ <= 2) { + lookuper.Backward(ComputeEVGradFn, + batch_size, 2, stream); + } else if (dimension_ <= 4) { + lookuper.Backward(ComputeEVGradFn, + batch_size, 4, stream); + } else if (dimension_ <= 8) { + lookuper.Backward(ComputeEVGradFn, + batch_size, 8, stream); + } else if (dimension_ <= 16) { + lookuper.Backward(ComputeEVGradFn, + batch_size, 16, stream); + } else 
if (dimension_ <= 32) { + lookuper.Backward(ComputeEVGradFn, + batch_size, 32, stream); + } else { + lookuper.Backward(NormalComputeEVGradFn, + batch_size, dimension_, stream); + } + } else { + if (dimension_ <= 2) { + lookuper.Backward(ComputeSparseGradFn, + batch_size, 2, stream); + } else if (dimension_ <= 4) { + lookuper.Backward(ComputeSparseGradFn, + batch_size, 4, stream); + } else if (dimension_ <= 8) { + lookuper.Backward(ComputeSparseGradFn, + batch_size, 8, stream); + } else if (dimension_ <= 16) { + lookuper.Backward(ComputeSparseGradFn, + batch_size, 16, stream); + } else if (dimension_ <= 32) { + lookuper.Backward(ComputeSparseGradFn, + batch_size, 32, stream); + } else { + lookuper.Backward(NormalComputeSparseGradFn, + batch_size, dimension_, stream); + } + } + } + + protected: + std::string combiner_; + float max_norm_; + int num_lookups_; + int dimension_; +}; + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_backward_ops.cc b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_backward_ops.cc new file mode 100644 index 00000000..6ac80896 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_backward_ops.cc @@ -0,0 +1,264 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=======================================================================*/ + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { + +template +class GroupEmbeddingVarLookupGradCpuOp : public OpKernel { + public: + explicit GroupEmbeddingVarLookupGradCpuOp(OpKernelConstruction* c) + : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("combiner", &combiner_)); + OP_REQUIRES_OK(c, c->GetAttr("num_lookups", &num_lookups_)); + OP_REQUIRES_OK(c, c->GetAttr("dimension", &dimension_)); + OP_REQUIRES_OK(c, c->GetAttr("max_norm", &max_norm_)); + } + + void Compute(OpKernelContext* ctx) override { + auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); + + for (int i = 0; i < num_lookups_; ++i) { + const Tensor grads_tensor = ctx->input(i); + auto* grads = grads_tensor.flat().data(); + const Tensor unique_keys_tensor = ctx->input(2 * num_lookups_ + i); + auto* unique_keys = unique_keys_tensor.flat().data(); + int unique_nnz = unique_keys_tensor.NumElements(); + + const Tensor sp_indices_tensor = ctx->input(3 * num_lookups_ + i); + auto* sp_indices = sp_indices_tensor.flat().data(); + const Tensor batch_nums_tensor = ctx->input(4 * num_lookups_ + i); + auto* batch_nums = batch_nums_tensor.flat().data(); + + Tensor* grads_sp_values_tensor; + TensorShape grads_sp_values_tensor_shape = + TensorShape(std::vector({unique_nnz, dimension_})); + OP_REQUIRES_OK(ctx, ctx->allocate_output(i, grads_sp_values_tensor_shape, + &grads_sp_values_tensor)); + auto* grads_sp_values = grads_sp_values_tensor->flat().data(); + + int slice_bytes = unique_nnz * dimension_ * 1000; + if (combiner_ == "mean") { + auto embedding_var_grad_combiner = [this, &grads_sp_values, sp_indices, + grads, batch_nums](int64 start, + int64 end) { + for (int64 i = start; i < end; ++i) { + // Code Not Help + // #if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + // int segment_id = sp_indices[i]; + // int scale = batch_nums[segment_id]; + // __m512 _weights = _mm512_set1_ps(scale); + // for (int d = 0; d < dimension_; d+=16) { + // int index = d / 16; + // int remain = dimension_ - d; + // __mmask16 mask = (remain >= 16 ? 0xffff : (1 << + // remain) - 1); + // __m512 _grads = _mm512_set1_ps(grads[segment_id * + // dimension_ + d]); + // __m512 _item = _mm512_div_ps(_grads, _weights); + // _mm512_mask_storeu_ps(grads_sp_values + i * + // dimension_ + d, mask, _item); + // } + // #else + int segment_id = sp_indices[i]; + int batch_offset = segment_id == 0 ? 
0 : batch_nums[segment_id - 1]; + int scale = batch_nums[segment_id] - batch_offset; + for (int d = 0; d < dimension_; ++d) { + grads_sp_values[i * dimension_ + d] = + grads[segment_id * dimension_ + d] / scale; + } + // #endif + } + }; + Shard(worker_threads->num_threads, worker_threads->workers, unique_nnz, + slice_bytes /*cost*/, + embedding_var_grad_combiner); // Parallel on batch + } else if (combiner_ == "sum") { + auto embedding_var_grad_combiner = [this, &grads_sp_values, sp_indices, + grads, batch_nums](int64 start, + int64 end) { + for (int64 i = start; i < end; ++i) { + int segment_id = sp_indices[i]; + memcpy(grads_sp_values + i * dimension_, + grads + segment_id * dimension_, + sizeof(TValue) * dimension_); + } + }; + Shard(worker_threads->num_threads, worker_threads->workers, unique_nnz, + slice_bytes /*cost*/, embedding_var_grad_combiner); + } else { + auto embedding_var_grad_combiner = [this, &grads_sp_values, sp_indices, + grads, batch_nums](int64 start, + int64 end) { + for (int64 i = start; i < end; ++i) { + // #if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + // int segment_id = sp_indices[i]; + // int scale = batch_nums[segment_id]; + // __m512 _weights = _mm512_set1_ps(sqrtf(scale)); + // for (int d = 0; d < dimension_; d += 16) { + // int index = d / 16; + // int remain = dimension_ - d; + // __mmask16 mask = (remain >= 16 ? 0xffff : (1 << + // remain) - 1); + // __m512 _grads = + // _mm512_set1_ps(grads[segment_id * dimension_ + + // d]); + // __m512 _item = _mm512_div_ps(_grads, _weights); + // _mm512_mask_storeu_ps(grads_sp_values + i * + // dimension_ + d, mask, + // _item); + // } + // #else + int segment_id = sp_indices[i]; + int batch_offset = segment_id == 0 ? 0 : batch_nums[segment_id - 1]; + int scale = batch_nums[segment_id] - batch_offset; + for (int d = 0; d < dimension_; ++d) { + grads_sp_values[i * dimension_ + d] = + grads[segment_id * dimension_ + d] / sqrtf(scale); + } + // #endif + } + }; + Shard(worker_threads->num_threads, worker_threads->workers, unique_nnz, + slice_bytes /*cost*/, embedding_var_grad_combiner); + } + } + } + + private: + std::string combiner_; + float max_norm_; + int num_lookups_; + int dimension_; +}; + +#define REGISTER_CPU_KERNELS(key_type, value_type) \ + REGISTER_KERNEL_BUILDER( \ + Name("GroupEmbeddingVariableLookupGrad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + GroupEmbeddingVarLookupGradCpuOp) + +REGISTER_CPU_KERNELS(int32, float); +REGISTER_CPU_KERNELS(int64, float); +#undef REGISTER_CPU_KERNELS + +template +class GroupVariableLookupGradCpuOp : public OpKernel { + public: + explicit GroupVariableLookupGradCpuOp(OpKernelConstruction* c) : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("combiner", &combiner_)); + OP_REQUIRES_OK(c, c->GetAttr("num_lookups", &num_lookups_)); + OP_REQUIRES_OK(c, c->GetAttr("dimension", &dimension_)); + OP_REQUIRES_OK(c, c->GetAttr("max_norm", &max_norm_)); + } + + void Compute(OpKernelContext* ctx) override { + auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); + for (int i = 0; i < num_lookups_; ++i) { + const Tensor grads_tensor = ctx->input(i); + auto* grads = grads_tensor.flat().data(); + const Tensor emb_variables_tensor = ctx->input(num_lookups_ + i); + const Tensor unique_keys_tensor = ctx->input(2 * num_lookups_ + i); + auto* unique_keys = unique_keys_tensor.flat().data(); + int unique_nnz = unique_keys_tensor.NumElements(); + + const Tensor sp_indices_tensor = ctx->input(3 * num_lookups_ + i); + auto* 
sp_indices = sp_indices_tensor.flat().data(); + const Tensor batch_nums_tensor = ctx->input(4 * num_lookups_ + i); + auto* batch_nums = batch_nums_tensor.flat().data(); + + Tensor* grads_sp_values_tensor; + TensorShape grads_sp_values_tensor_shape = + TensorShape(std::vector({unique_nnz, dimension_})); + OP_REQUIRES_OK(ctx, ctx->allocate_output(i, grads_sp_values_tensor_shape, + &grads_sp_values_tensor)); + TValue* grads_sp_values = grads_sp_values_tensor->flat().data(); + + int slice_bytes = unique_nnz * dimension_ * 1000; + if (combiner_ == "mean") { + auto embedding_var_grad_combiner = [this, &grads_sp_values, sp_indices, + grads, batch_nums](int64 start, + int64 end) { + for (int64 i = start; i < end; ++i) { + int segment_id = sp_indices[i]; + int batch_offset = segment_id == 0 ? 0 : batch_nums[segment_id - 1]; + int scale = batch_nums[segment_id] - batch_offset; + for (int d = 0; d < dimension_; ++d) { + grads_sp_values[i * dimension_ + d] = + grads[segment_id * dimension_ + d] / scale; + } + } + }; + Shard(worker_threads->num_threads, worker_threads->workers, unique_nnz, + slice_bytes /*cost*/, + embedding_var_grad_combiner); // Parallel on batch + } else if (combiner_ == "sum") { + auto embedding_var_grad_combiner = [this, &grads_sp_values, sp_indices, + grads, batch_nums](int64 start, + int64 end) { + for (int64 i = start; i < end; ++i) { + int segment_id = sp_indices[i]; + memcpy(grads_sp_values + i * dimension_, + grads + segment_id * dimension_, + sizeof(TValue) * dimension_); + } + }; + Shard(worker_threads->num_threads, worker_threads->workers, unique_nnz, + slice_bytes /*cost*/, + embedding_var_grad_combiner); // Parallel on batch + } else { + auto embedding_var_grad_combiner = [this, &grads_sp_values, sp_indices, + grads, batch_nums](int64 start, + int64 end) { + for (int64 i = start; i < end; ++i) { + int segment_id = sp_indices[i]; + int batch_offset = segment_id == 0 ? 0 : batch_nums[segment_id - 1]; + int scale = batch_nums[segment_id] - batch_offset; + for (int d = 0; d < dimension_; ++d) { + grads_sp_values[i * dimension_ + d] = + grads[segment_id * dimension_ + d] / sqrtf(scale); + } + } + }; + Shard(worker_threads->num_threads, worker_threads->workers, unique_nnz, + slice_bytes /*cost*/, + embedding_var_grad_combiner); // Parallel on batch + } + } + } + + private: + std::string combiner_; + float max_norm_; + int num_lookups_; + int dimension_; +}; + +#define REGISTER_CPU_KERNELS(key_type, value_type) \ + REGISTER_KERNEL_BUILDER(Name("GroupVariableLookupGrad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + GroupVariableLookupGradCpuOp) + +REGISTER_CPU_KERNELS(int32, float); +REGISTER_CPU_KERNELS(int64, float); +#undef REGISTER_CPU_KERNELS + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_backward_ops.cu.cc b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_backward_ops.cu.cc new file mode 100644 index 00000000..16d99562 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_backward_ops.cu.cc @@ -0,0 +1,176 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +=======================================================================*/ + +#define EIGEN_USE_THREADS + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU + +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h" +#include "deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_common.cu.h" +#include "group_embedding_lookup_sparse_backward_base_ops.cu.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/resource_var.h" +#include "tensorflow/core/kernels/training_op_helpers.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { + +using GPUDevice = Eigen::GpuDevice; + +template +class GroupVariableLookupBackwardOp + : public GroupLookupBackWardBaseOp { + public: + explicit GroupVariableLookupBackwardOp(OpKernelConstruction* c) + : GroupLookupBackWardBaseOp(c) {} + + void Compute(OpKernelContext* ctx) override { + auto stream = ctx->eigen_device().stream(); + int batch_size = -1; + + Allocator* gpu_allocator = + ctx->device()->GetAllocator(AllocatorAttributes()); + GroupEmbeddingLookupBackWard lookuper( + this->dimension_, this->num_lookups_, this->max_norm_, gpu_allocator); + for (int i = 0; i < this->num_lookups_; ++i) { + const Tensor grads_tensor = ctx->input(i); + const Tensor emb_variables_tensor = ctx->input(this->num_lookups_ + i); + const Tensor sp_values_tensor = ctx->input(2 * this->num_lookups_ + i); + const Tensor sp_values_offset_tensor = + ctx->input(4 * this->num_lookups_ + i); + const int64_t nnz = sp_values_tensor.NumElements(); + + Tensor* grads_sp_values_tensor; + TensorShape grads_sp_values_tensor_shape = + TensorShape(std::vector({nnz, this->dimension_})); + OP_REQUIRES_OK(ctx, ctx->allocate_output(i, grads_sp_values_tensor_shape, + &grads_sp_values_tensor)); + auto* grads_sp_values = grads_sp_values_tensor->flat().data(); + cudaMemsetAsync(grads_sp_values, 0, + sizeof(TValue) * nnz * this->dimension_, stream); + + if (i == 0) { + batch_size = sp_values_offset_tensor.shape().dim_size(0); + } + + GroupEmbeddingBackWardArgs args( + const_cast(grads_tensor.flat().data()), + const_cast(reinterpret_cast( + sp_values_tensor.flat().data())), + const_cast(emb_variables_tensor.flat().data()), + grads_sp_values, + const_cast(sp_values_offset_tensor.flat().data()), nnz); + lookuper.set(args); + } + + if (this->combiner_ == "mean") { + this->template compute(lookuper, batch_size, stream); + } else if (this->combiner_ == "sum") { + this->template compute(lookuper, batch_size, stream); + } else { + this->template compute(lookuper, batch_size, stream); + } + } +}; + +#define REGISTER_GPU_KERNELS(key_type_tf, key_type, dtype) \ + REGISTER_KERNEL_BUILDER( \ + Name("GroupVariableLookupGrad") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + GroupVariableLookupBackwardOp) + +REGISTER_GPU_KERNELS(int64, int64_t, float); +REGISTER_GPU_KERNELS(int32, int32_t, 
float); +#undef REGISTER_GPU_KERNELS + +template +class GroupEmbeddingVariableLookupBackwardOp + : public GroupLookupBackWardBaseOp { + public: + explicit GroupEmbeddingVariableLookupBackwardOp(OpKernelConstruction* c) + : GroupLookupBackWardBaseOp(c) {} + + void Compute(OpKernelContext* ctx) override { + auto stream = ctx->eigen_device().stream(); + int batch_size = -1; + + Allocator* gpu_allocator = + ctx->device()->GetAllocator(AllocatorAttributes()); + GroupEmbeddingLookupBackWard lookuper( + this->dimension_, this->num_lookups_, this->max_norm_, gpu_allocator); + for (int i = 0; i < this->num_lookups_; ++i) { + const Tensor grads_tensor = ctx->input(i); + EmbeddingVar* ev = nullptr; + OP_REQUIRES_OK( + ctx, LookupResource(ctx, HandleFromInput(ctx, this->num_lookups_ + i), + &ev)); + core::ScopedUnref unref_me(ev); + const Tensor sp_values_tensor = ctx->input(2 * this->num_lookups_ + i); + const Tensor sp_values_offset_tensor = + ctx->input(4 * this->num_lookups_ + i); + // int dimension = ev->ValueLen(); + if (i == 0) { + batch_size = sp_values_offset_tensor.shape().dim_size(0); + } + + const int64_t nnz = sp_values_tensor.NumElements(); + + Tensor* grads_sp_values_tensor; + TensorShape grads_sp_values_tensor_shape = + TensorShape(std::vector({nnz, this->dimension_})); + OP_REQUIRES_OK(ctx, ctx->allocate_output(i, grads_sp_values_tensor_shape, + &grads_sp_values_tensor)); + auto* grads_sp_values = grads_sp_values_tensor->flat().data(); + cudaMemsetAsync(grads_sp_values, 0, + sizeof(TValue) * nnz * this->dimension_, stream); + + GroupEmbeddingBackWardArgs args( + const_cast(grads_tensor.flat().data()), + const_cast(reinterpret_cast( + sp_values_tensor.flat().data())), + nullptr /*fake*/, grads_sp_values, + const_cast(sp_values_offset_tensor.flat().data()), nnz); + lookuper.set(args); + } + + if (this->combiner_ == "mean") { + this->template compute(lookuper, batch_size, stream); + } else if (this->combiner_ == "sum") { + this->template compute(lookuper, batch_size, stream); + } else { + this->template compute(lookuper, batch_size, stream); + } + } +}; + +#define REGISTER_GPU_KERNELS(key_type_tf, key_type, dtype) \ + REGISTER_KERNEL_BUILDER( \ + Name("GroupEmbeddingVariableLookupGrad") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + GroupEmbeddingVariableLookupBackwardOp) + +REGISTER_GPU_KERNELS(int64, int64_t, float); +REGISTER_GPU_KERNELS(int32, int32_t, float); +#undef REGISTER_GPU_KERNELS + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_base_ops.cu.h b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_base_ops.cu.h new file mode 100644 index 00000000..3091535d --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_base_ops.cu.h @@ -0,0 +1,721 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=======================================================================*/ + +#if GOOGLE_CUDA + +#include +#include + +#include "deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_common.cu.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/typed_allocator.h" +#include "tensorflow/core/kernels/training_op_helpers.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { + +namespace { + +template +struct GroupEmbeddingForWardArgs { + GroupEmbeddingForWardArgs() = default; + GroupEmbeddingForWardArgs(TValue* emb_variable, TValue* sp_weights, + TValue* emb_vector, TKey* sp_values, + int* offset_indices, int nnz) + : emb_variable_(emb_variable), + sp_weights_(sp_weights), + emb_vector_(emb_vector), + sp_values_(sp_values), + offset_indices_(offset_indices), + nnz_(nnz) {} + TValue* emb_variable_; + TValue* sp_weights_; + TValue* emb_vector_; + TKey* sp_values_; + int* offset_indices_; + int nnz_; +}; + +__global__ void SetToIntMaxSTG128(const int batch_size, int* values_offset) { + const int thread_offset = 4 * (blockIdx.x * blockDim.x + threadIdx.x); + const int int_max = 0x7fffffff; + if (thread_offset + 4 < batch_size) { + ::int4 four = make_int4(int_max, int_max, int_max, int_max); + *((::int4*)(values_offset + thread_offset)) = four; + } else if (thread_offset < batch_size) { + for (int i = thread_offset; i < batch_size; i++) { + values_offset[i] = int_max; + } + } +} + +__device__ void FilledEmptyRowNumber(int batch_size, + volatile int* values_offset) { + const int thread_offset = blockIdx.x * blockDim.x + threadIdx.x; + const int int_max = 0x7fffffff; + if (thread_offset > 1) { + if (thread_offset < batch_size) { + while (values_offset[thread_offset] == int_max) { + const int compare = values_offset[thread_offset - 1]; + if (compare != int_max) { + atomicMin((int*)values_offset + thread_offset, compare); + } + } + } + } else { + if (values_offset[thread_offset] == int_max) { + values_offset[thread_offset] = 0; + } + } +} + +__global__ void CalcPerElementRowOffset(int batch_size, int nnz, int stride, + const int64_t* indices, + int* values_offset) { + const int thread_offset = blockIdx.x * blockDim.x + threadIdx.x; + if (thread_offset < nnz) { + const int64_t element_row = indices[stride * thread_offset]; + atomicMin((int*)values_offset + int(element_row), thread_offset); + } + __syncthreads(); + FilledEmptyRowNumber(batch_size, values_offset); +} + +inline void launch_cal_per_element_row_offset(const int batch_size, int nnz, + int stride, + const int64_t* sp_indices, + int* offset_indices, + cudaStream_t stream) { + static int threads = 1024; + int blocks = (batch_size - 1) / threads + 1; + + SetToIntMaxSTG128<<>>(batch_size, offset_indices); + blocks = (nnz - 1) / threads + 1; + CalcPerElementRowOffset<<>>( + batch_size, nnz, stride, sp_indices, offset_indices); +} + +template +__global__ void WeightedEmbeddingVarComputeFn( + const int batch_size, const int dimension, const float max_norm, + const int num_lookups, GroupEmbeddingForWardArgs* args) { + TValue l2_sum; + + const auto& block = cooperative_groups::this_thread_block(); + const auto& tile = cooperative_groups::tiled_partition(block); + // each block partition corresponding to one sample + const int bid = + block.group_index().x * tile.meta_group_size() + tile.meta_group_rank(); + // each thread corresponding to one element in the embedding vector + const int tid = tile.thread_rank(); + + 
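+      // Annotation (not part of the original DeepRec kernel): a sketch of the
+      // cooperative-groups mapping used by this kernel and the sibling
+      // *ComputeFn kernels below, assuming the standard tiled_partition
+      // semantics. One tile of tile_sz threads owns one sample and each lane
+      // owns one element of the output embedding vector:
+      //   bid == blockIdx.x * (blockDim.x / tile_sz) + threadIdx.x / tile_sz
+      //   tid == threadIdx.x % tile_sz
+      // so each lane ends up writing
+      //   args[ev_id].emb_vector_[bid * dimension + tid].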
if (bid < batch_size && tid < dimension) { + for (int ev_id = 0; ev_id < num_lookups; ++ev_id) { + int value_offset = args[ev_id].offset_indices_[bid]; + int feature_num; + if (bid == (batch_size - 1)) { + feature_num = args[ev_id].nnz_ - value_offset; + } else { + feature_num = args[ev_id].offset_indices_[bid + 1] - value_offset; + } + + float out = 0.0f; + float total_batch_weight = 0.0f; + if (feature_num > 0) { + for (int j = 0; j < feature_num; ++j) { + size_t feature_indices = value_offset + j; + int64_t embedding_offset = feature_indices * dimension; + TValue sum = args[ev_id].emb_variable_[embedding_offset + tid]; + TValue sp_weights = args[ev_id].sp_weights_[feature_indices]; + total_batch_weight += sp_weights; + if (max_norm >= 0.0) { + if (tid == 0) { + l2_sum = 0.0; + } + tile.shfl(l2_sum, 0); + atomicAdd(&l2_sum, sum * sum); + tile.sync(); + TValue l2_norm = sqrtf(l2_sum); + if (l2_norm > max_norm) { + sum *= max_norm / l2_norm; + } + } + out = __fmaf_rn(sum, sp_weights, out); + } + out = Combine(out, total_batch_weight); + } + args[ev_id].emb_vector_[bid * dimension + tid] = out; + } + } +} + +template +__global__ void WeightedVariableComputeFn( + const int batch_size, const int emb_vec_size, const float max_norm, + const int num_lookups, GroupEmbeddingForWardArgs* args) { + TValue l2_sum; + const auto& block = cooperative_groups::this_thread_block(); + const auto& tile = cooperative_groups::tiled_partition(block); + // each block partition corresponding to one sample + const int bid = + block.group_index().x * tile.meta_group_size() + tile.meta_group_rank(); + // each thread corresponding to one element in the embedding vector + const int tid = tile.thread_rank(); + + if (bid < batch_size && tid < emb_vec_size) { + for (int ev_id = 0; ev_id < num_lookups; ++ev_id) { + int value_offset = args[ev_id].offset_indices_[bid]; + int feature_num; + if (bid == (batch_size - 1)) { + feature_num = args[ev_id].nnz_ - value_offset; + } else { + feature_num = args[ev_id].offset_indices_[bid + 1] - value_offset; + } + + TValue out = 0.0f; + TValue total_batch_weight = 0.0f; + const TValue* emb_variable = args[ev_id].emb_variable_; + // #pragma unroll + if (feature_num > 0) { + for (int i = 0; i < feature_num; i++) { + size_t feature_indices = value_offset + i; + int embedding_indices = int(args[ev_id].sp_values_[feature_indices]); + TValue sp_weights = args[ev_id].sp_weights_[embedding_indices]; + total_batch_weight += sp_weights; + TValue emb_element = emb_variable[feature_indices]; + if (max_norm >= 0.0f) { + // calc l2 norm of this emb row(per block) and compare with + // max_norm. 
+ // if greater than max_norm, then clip every element with factor + // max_norm / l2norm + if (tid == 0) { + l2_sum = 0.0f; + } + tile.shfl(l2_sum, 0); + atomicAdd(&l2_sum, emb_element * emb_element); + tile.sync(); + TValue l2_norm = sqrtf(l2_sum); + if (l2_norm > max_norm) { + emb_element *= max_norm / l2_norm; + } + } + out = __fmaf_rn(emb_element, sp_weights, out); + } + out = Combine(out, total_batch_weight); + } + args[ev_id].emb_vector_[bid * emb_vec_size + tid] = out; + } + } +} + +template +__global__ void EmbeddingVarComputeFn( + const int batch_size, const int dimension, const float max_norm, + const int num_lookups, GroupEmbeddingForWardArgs* args) { + TValue l2_sum; + + const auto& block = cooperative_groups::this_thread_block(); + const auto& tile = cooperative_groups::tiled_partition(block); + // each block partition corresponding to one sample + const int bid = + block.group_index().x * tile.meta_group_size() + tile.meta_group_rank(); + // each thread corresponding to one element in the embedding vector + const int tid = tile.thread_rank(); + + if (bid < batch_size && tid < dimension) { + for (int ev_id = 0; ev_id < num_lookups; ++ev_id) { + int value_offset = args[ev_id].offset_indices_[bid]; + int feature_num; + if (bid == (batch_size - 1)) { + feature_num = args[ev_id].nnz_ - value_offset; + } else { + feature_num = args[ev_id].offset_indices_[bid + 1] - value_offset; + } + TValue out = 0.0; + + // #pragma unroll + if (feature_num > 0) { + for (int j = 0; j < feature_num; ++j) { + int64_t feature_offset = (value_offset + j) * dimension; + TValue sum = args[ev_id].emb_variable_[feature_offset + tid]; + if (max_norm >= 0.0) { + if (tid == 0) { + l2_sum = 0.0; + } + tile.shfl(l2_sum, 0); + atomicAdd(&l2_sum, sum * sum); + tile.sync(); + TValue l2_norm = sqrtf(l2_sum); + if (l2_norm > max_norm) { + sum *= max_norm / l2_norm; + } + } + out += sum; + } + out = Combine(out, feature_num); + } + args[ev_id].emb_vector_[bid * dimension + tid] = out; + } + } +} + +template +__global__ void VariableComputeFn( + const int batch_size, const int emb_vec_size, const float max_norm, + const int num_lookups, GroupEmbeddingForWardArgs* args) { + TValue l2_sum; + const auto& block = cooperative_groups::this_thread_block(); + const auto& tile = cooperative_groups::tiled_partition(block); + // each block partition corresponding to one sample + const int bid = + block.group_index().x * tile.meta_group_size() + tile.meta_group_rank(); + // each thread corresponding to one element in the embedding vector + const int tid = tile.thread_rank(); + + if (bid < batch_size && tid < emb_vec_size) { + for (int ev_id = 0; ev_id < num_lookups; ++ev_id) { + int value_offset = args[ev_id].offset_indices_[bid]; + int feature_num; + if (bid == (batch_size - 1)) { + feature_num = args[ev_id].nnz_ - value_offset; + } else { + feature_num = args[ev_id].offset_indices_[bid + 1] - value_offset; + } + TValue out = 0.0f; + + const TValue* emb_variable = args[ev_id].emb_variable_; + // #pragma unroll + if (feature_num > 0) { + for (int i = 0; i < feature_num; i++) { + int indices = int(args[ev_id].sp_values_[value_offset + i]); + TValue emb_element = emb_variable[indices * emb_vec_size + tid]; + // printf("indices is %d emb_element is %f\n", indices, emb_element); + if (max_norm >= 0.0f) { + // calc l2 norm of this emb row(per block) and compare with + // max_norm. 
+ // if greater than max_norm, then clip every element with factor + // max_norm / l2norm + if (tid == 0) { + l2_sum = 0.0f; + } + tile.shfl(l2_sum, 0); + atomicAdd(&l2_sum, emb_element * emb_element); + tile.sync(); + TValue l2_norm = sqrtf(l2_sum); + if (l2_norm > max_norm) { + emb_element *= max_norm / l2_norm; + } + } + out += emb_element; + } + out = Combine(out, feature_num); + } + args[ev_id].emb_vector_[bid * emb_vec_size + tid] = out; + } + } +} + +template +__global__ void NormalEmbeddingVarComputeFn( + const int batch_size, const int dimension, const float max_norm, + const int num_lookups, GroupEmbeddingForWardArgs* args) { + __shared__ TValue l2_sum[1]; + + const auto& block = cooperative_groups::this_thread_block(); + // each block partition corresponding to one sample + const int bid = block.group_index().x; + // each thread corresponding to one element in the embedding vector + const int tid = block.thread_rank(); + + if (bid < batch_size && tid < dimension) { + for (int ev_id = 0; ev_id < num_lookups; ++ev_id) { + int value_offset = args[ev_id].offset_indices_[bid]; + int feature_num; + if (bid == (batch_size - 1)) { + feature_num = args[ev_id].nnz_ - value_offset; + } else { + feature_num = args[ev_id].offset_indices_[bid + 1] - value_offset; + } + TValue out = 0.0; + + // #pragma unroll + if (feature_num > 0) { + for (int j = 0; j < feature_num; ++j) { + int64_t feature_offset = (value_offset + j) * dimension; + TValue sum = args[ev_id].emb_variable_[feature_offset + tid]; + if (max_norm >= 0.0) { + if (tid == 0) { + l2_sum[0] = 0.0; + } + block.sync(); + atomicAdd(l2_sum, sum * sum); + block.sync(); + TValue l2_norm = sqrtf(l2_sum[0]); + if (l2_norm > max_norm) { + sum *= max_norm / l2_norm; + } + } + out += sum; + } + out = Combine(out, feature_num); + } + args[ev_id].emb_vector_[bid * dimension + tid] = out; + } + } +} + +template +__global__ void NormalVariableComputeFn( + const int batch_size, const int emb_vec_size, const float max_norm, + const int num_lookups, GroupEmbeddingForWardArgs* args) { + __shared__ TValue l2_sum[1]; + const auto& block = cooperative_groups::this_thread_block(); + // each block partition corresponding to one sample + const int bid = block.group_index().x; + // each thread corresponding to one element in the embedding vector + const int tid = block.thread_rank(); + + if (bid < batch_size && tid < emb_vec_size) { + for (int ev_id = 0; ev_id < num_lookups; ++ev_id) { + int value_offset = args[ev_id].offset_indices_[bid]; + int feature_num; + if (bid == (batch_size - 1)) { + feature_num = args[ev_id].nnz_ - value_offset; + } else { + feature_num = args[ev_id].offset_indices_[bid + 1] - value_offset; + } + TValue out = 0.0f; + + const TValue* emb_variable = args[ev_id].emb_variable_; + // #pragma unroll + if (feature_num > 0) { + for (int i = 0; i < feature_num; i++) { + int indices = int(args[ev_id].sp_values_[value_offset + i]); + TValue emb_element = emb_variable[indices * emb_vec_size + tid]; + // printf("indices is %d emb_element is %f\n", indices, emb_element); + if (max_norm >= 0.0f) { + // calc l2 norm of this emb row(per block) and compare with + // max_norm. 
+ // if greater than max_norm, then clip every element with factor + // max_norm / l2norm + if (tid == 0) { + l2_sum[0] = 0.0f; + } + block.sync(); + atomicAdd(l2_sum, emb_element * emb_element); + block.sync(); + TValue l2_norm = sqrtf(l2_sum[0]); + if (l2_norm > max_norm) { + emb_element *= max_norm / l2_norm; + } + } + out += emb_element; + } + out = Combine(out, feature_num); + } + args[ev_id].emb_vector_[bid * emb_vec_size + tid] = out; + } + } +} + +template +__global__ void NormalWeightedEmbeddingVarComputeFn( + const int batch_size, const int dimension, const float max_norm, + const int num_lookups, GroupEmbeddingForWardArgs* args) { + __shared__ TValue l2_sum[1]; + + const auto& block = cooperative_groups::this_thread_block(); + // each block partition corresponding to one sample + const int bid = block.group_index().x; + // each thread corresponding to one element in the embedding vector + const int tid = block.thread_rank(); + + if (bid < batch_size && tid < dimension) { + for (int ev_id = 0; ev_id < num_lookups; ++ev_id) { + int value_offset = args[ev_id].offset_indices_[bid]; + int feature_num; + if (bid == (batch_size - 1)) { + feature_num = args[ev_id].nnz_ - value_offset; + } else { + feature_num = args[ev_id].offset_indices_[bid + 1] - value_offset; + } + TValue out = 0.0f; + TValue total_batch_weight = 0.0f; + // #pragma unroll + if (feature_num > 0) { + for (int j = 0; j < feature_num; ++j) { + size_t feature_indices = value_offset + j; + int64_t embedding_offset = feature_indices * dimension; + TValue sum = args[ev_id].emb_variable_[embedding_offset + tid]; + TValue sp_weights = args[ev_id].sp_weights_[feature_indices]; + total_batch_weight += sp_weights; + if (max_norm >= 0.0) { + if (tid == 0) { + l2_sum[0] = 0.0; + } + block.sync(); + atomicAdd(l2_sum, sum * sum); + block.sync(); + TValue l2_norm = sqrtf(l2_sum[0]); + if (l2_norm > max_norm) { + sum *= max_norm / l2_norm; + } + } + out = __fmaf_rn(sum, sp_weights, out); + } + out = Combine(out, total_batch_weight); + } + args[ev_id].emb_vector_[bid * dimension + tid] = out; + } + } +} + +template +__global__ void NormalWeightedVariableComputeFn( + const int batch_size, const int emb_vec_size, const float max_norm, + const int num_lookups, GroupEmbeddingForWardArgs* args) { + __shared__ TValue l2_sum[1]; + const auto& block = cooperative_groups::this_thread_block(); + // each block partition corresponding to one sample + const int bid = block.group_index().x; + // each thread corresponding to one element in the embedding vector + const int tid = block.thread_rank(); + + if (bid < batch_size && tid < emb_vec_size) { + for (int ev_id = 0; ev_id < num_lookups; ++ev_id) { + int value_offset = args[ev_id].offset_indices_[bid]; + int feature_num; + if (bid == (batch_size - 1)) { + feature_num = args[ev_id].nnz_ - value_offset; + } else { + feature_num = args[ev_id].offset_indices_[bid + 1] - value_offset; + } + TValue out = 0.0f; + TValue total_batch_weight = 0.0f; + const TValue* emb_variable = args[ev_id].emb_variable_; + + // #pragma unroll + if (feature_num > 0) { + for (int i = 0; i < feature_num; i++) { + size_t feature_indices = value_offset + i; + int embedding_indices = int(args[ev_id].sp_values_[feature_indices]); + TValue emb_element = + emb_variable[embedding_indices * emb_vec_size + tid]; + TValue sp_weights = args[ev_id].sp_weights_[feature_indices]; + total_batch_weight += sp_weights; + // printf("indices is %d emb_element is %f\n", indices, emb_element); + if (max_norm >= 0.0f) { + // calc l2 norm of this 
emb row(per block) and compare with + // max_norm. + // if greater than max_norm, then clip every element with factor + // max_norm / l2norm + if (tid == 0) { + l2_sum[0] = 0.0f; + } + block.sync(); + atomicAdd(l2_sum, emb_element * emb_element); + block.sync(); + TValue l2_norm = sqrtf(l2_sum[0]); + if (l2_norm > max_norm) { + emb_element *= max_norm / l2_norm; + } + } + out = __fmaf_rn(emb_element, sp_weights, out); + } + out = Combine(out, feature_num); + } + args[ev_id].emb_vector_[bid * emb_vec_size + tid] = out; + } + } +} + +template +class GroupEmbeddingLookupForWard { + public: + explicit GroupEmbeddingLookupForWard(const int num_lookups, + const int dimension, + const float max_norm, + Allocator* gpu_allocator = nullptr) + : alloc_(gpu_allocator) { + max_norm_ = max_norm; + dimension_ = dimension; + ev_nums_ = num_lookups; + d_args_ = TypedAllocator::Allocate>( + gpu_allocator, num_lookups, AllocationAttributes()); + args_size_ = sizeof(GroupEmbeddingForWardArgs); + h_args_.reserve(ev_nums_); + } + + ~GroupEmbeddingLookupForWard() { + TypedAllocator::Deallocate(alloc_, d_args_, ev_nums_); + } + + void set(GroupEmbeddingForWardArgs& arg) { + h_args_.emplace_back(arg); + } + + template + inline void Lookup(ForwardFn compute_fn, const int batch_size, + const int tile_size, cudaStream_t stream) { + CK_CUDA_THROW_(cudaMemcpyAsync(d_args_, h_args_.data(), + ev_nums_ * args_size_, + cudaMemcpyHostToDevice, stream)); + + { + if (tile_size <= 32) { + const int block_size = batch_size * tile_size / 64 + 1; + compute_fn<<>>(batch_size, dimension_, + max_norm_, ev_nums_, d_args_); + } else { + compute_fn<<>>( + batch_size, dimension_, max_norm_, ev_nums_, d_args_); + } + } + + CK_CUDA_THROW_(cudaGetLastError()); + } + + protected: + std::vector> h_args_; + GroupEmbeddingForWardArgs* d_args_{nullptr}; + Allocator* alloc_; + float max_norm_{0.0f}; + int ev_nums_{0}; + int dimension_{0}; + size_t args_size_{0}; +}; + +template +class GroupEmbeddingLookupForwardBaseOp : public OpKernel { + public: + explicit GroupEmbeddingLookupForwardBaseOp(OpKernelConstruction* c) + : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("combiner", &combiner_)); + OP_REQUIRES_OK(c, c->GetAttr("num_lookups", &num_lookups_)); + OP_REQUIRES_OK(c, c->GetAttr("dimension", &dimension_)); + OP_REQUIRES_OK(c, c->GetAttr("max_norm", &max_norm_)); + OP_REQUIRES_OK(c, c->GetAttr("ignore_weights", &ignore_weights_)); + OP_REQUIRES_OK(c, c->GetAttr("is_sequence", &is_sequence_)); + } + + template + inline void compute(GroupEmbeddingLookupForWard& lookuper, + const int batch_size, cudaStream_t stream) { + if (isEv) { + if (ignore_weights_) { + if (dimension_ <= 2) { + lookuper.Lookup(EmbeddingVarComputeFn, + batch_size, 2, stream); + } else if (dimension_ <= 4) { + lookuper.Lookup(EmbeddingVarComputeFn, + batch_size, 4, stream); + } else if (dimension_ <= 8) { + lookuper.Lookup(EmbeddingVarComputeFn, + batch_size, 8, stream); + } else if (dimension_ <= 16) { + lookuper.Lookup(EmbeddingVarComputeFn, + batch_size, 16, stream); + } else if (dimension_ <= 32) { + lookuper.Lookup(EmbeddingVarComputeFn, + batch_size, 32, stream); + } else { + lookuper.Lookup(NormalEmbeddingVarComputeFn, + batch_size, dimension_, stream); + } + } else { + if (dimension_ <= 2) { + lookuper.Lookup( + WeightedEmbeddingVarComputeFn, + batch_size, 2, stream); + } else if (dimension_ <= 4) { + lookuper.Lookup( + WeightedEmbeddingVarComputeFn, + batch_size, 4, stream); + } else if (dimension_ <= 8) { + lookuper.Lookup( + WeightedEmbeddingVarComputeFn, + 
batch_size, 8, stream); + } else if (dimension_ <= 16) { + lookuper.Lookup( + WeightedEmbeddingVarComputeFn, + batch_size, 16, stream); + } else if (dimension_ <= 32) { + lookuper.Lookup( + WeightedEmbeddingVarComputeFn, + batch_size, 32, stream); + } else { + lookuper.Lookup( + NormalWeightedEmbeddingVarComputeFn, + batch_size, dimension_, stream); + } + } + } else { + if (ignore_weights_) { + if (dimension_ <= 2) { + lookuper.Lookup(VariableComputeFn, + batch_size, 2, stream); + } else if (dimension_ <= 4) { + lookuper.Lookup(VariableComputeFn, + batch_size, 4, stream); + } else if (dimension_ <= 8) { + lookuper.Lookup(VariableComputeFn, + batch_size, 8, stream); + } else if (dimension_ <= 16) { + lookuper.Lookup(VariableComputeFn, + batch_size, 16, stream); + } else if (dimension_ <= 32) { + lookuper.Lookup(VariableComputeFn, + batch_size, 32, stream); + } else { + lookuper.Lookup(NormalVariableComputeFn, + batch_size, dimension_, stream); + } + } else { + if (dimension_ <= 2) { + lookuper.Lookup(WeightedVariableComputeFn, + batch_size, 2, stream); + } else if (dimension_ <= 4) { + lookuper.Lookup(WeightedVariableComputeFn, + batch_size, 4, stream); + } else if (dimension_ <= 8) { + lookuper.Lookup(WeightedVariableComputeFn, + batch_size, 8, stream); + } else if (dimension_ <= 16) { + lookuper.Lookup(WeightedVariableComputeFn, + batch_size, 16, stream); + } else if (dimension_ <= 32) { + lookuper.Lookup(WeightedVariableComputeFn, + batch_size, 32, stream); + } else { + lookuper.Lookup( + NormalWeightedVariableComputeFn, + batch_size, dimension_, stream); + } + } + } + } + + protected: + std::string combiner_; + float max_norm_; + int num_lookups_; + int dimension_; + bool ignore_weights_; + bool is_sequence_; +}; + +} // namespace + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_base_ops.h b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_base_ops.h new file mode 100644 index 00000000..c1395e00 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_base_ops.h @@ -0,0 +1,64 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +=======================================================================*/ + +#define EIGEN_USE_THREADS + +#include "deepray/custom_ops/unique_ops/cc/kernels/unique_ali_op_util.h", +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/kernels/training_op_helpers.h" + +namespace tensorflow { +// It's suggested that all CPU GroupEmbedding operations inherit from this base +// class. 
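+// A minimal usage sketch of this base class (illustrative only; the name
+// MyGroupLookupCpuOp is hypothetical, and the template parameters are assumed
+// to be the key/value types used by the derived ops in this patch):
+//
+//   template <typename TKey, typename TValue>
+//   class MyGroupLookupCpuOp : public GroupLookupBaseCpuOp<TKey, TValue> {
+//    public:
+//     explicit MyGroupLookupCpuOp(OpKernelConstruction* c)
+//         : GroupLookupBaseCpuOp<TKey, TValue>(c) {}
+//     void Compute(OpKernelContext* ctx) override {
+//       for (int i = 0; i < this->m_num_lookup; ++i) {
+//         // unique the i-th sparse id tensor, gather rows of width
+//         // this->m_dimension, then pool them according to this->m_combiner.
+//       }
+//     }
+//   };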
+template +class GroupLookupBaseCpuOp : public OpKernel { + public: + explicit GroupLookupBaseCpuOp(OpKernelConstruction* c) : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("combiner", &m_combiner)); + OP_REQUIRES_OK(c, c->GetAttr("num_lookups", &m_num_lookup)); + OP_REQUIRES_OK(c, c->GetAttr("dimension", &m_dimension)); + // OP_REQUIRES_OK(c, c->GetAttr("max_norm", &max_norm_)); + OP_REQUIRES_OK(c, c->GetAttr("ignore_weights", &m_ignore_weights)); + OP_REQUIRES_OK(c, c->GetAttr("is_sequence", &m_is_sequence)); + OP_REQUIRES_OK(c, ReadInt64FromEnvVar(kUniqueOpPartitionSizeEnv, + kPartitionSize, &partition_size_)); + OP_REQUIRES( + c, partition_size_ > 0, + errors::InvalidArgument("Invalid PARTITION_SIZE=", partition_size_)); + OP_REQUIRES_OK(c, ReadBoolFromEnvVar(kUniqueOpSerialEnv, false, &serial_)); + OP_REQUIRES_OK( + c, ReadInt64FromEnvVar(kUniqueOpUniqRatioHint, kDefaultUniqueRatioHint, + &unique_ratio_hint_)); + OP_REQUIRES(c, unique_ratio_hint_ > 0, + errors::InvalidArgument("Invalid ", kUniqueOpUniqRatioHint, "=", + unique_ratio_hint_)); + } + + protected: + // float max_norm_; + int m_num_lookup; + int m_dimension; + bool m_is_use_default_value_tensor; + bool m_ignore_weights; + bool m_is_sequence; + std::string m_combiner; + bool serial_ = false; + int64 partition_size_ = 0; + int64 unique_ratio_hint_; + UniqueMaps map_flag_ = GOOGLE; // "GOOGLE" dense hash map is default + const int64 kDefaultUniqueRatioHint = 4; + const char* kUniqueOpSerialEnv = "DEEPREC_UNIQUE_OP_SERIAL"; + const char* kUniqueOpUniqRatioHint = "DEEPREC_UNIQUE_OP_UNIQ_RATIO_HINT"; + const char* kUniqueOpPartitionSizeEnv = "DEEPREC_UNIQUE_OP_PARTITION_SIZE"; +}; + +} // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_ops.cc b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_ops.cc new file mode 100644 index 00000000..d4c61922 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_ops.cc @@ -0,0 +1,690 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+=======================================================================*/ + +#define EIGEN_USE_THREADS + +#include + +#include "deepray/custom_ops/embedding_variable/cc/embedding/cache.h" +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h" +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_context.h" +#include "group_embedding_lookup_sparse_forward_base_ops.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/util/work_sharder.h" +namespace tensorflow { + +#define USING_BASE_CLASS_MEMBER \ + using GroupLookupBaseCpuOp::m_num_lookup; \ + using GroupLookupBaseCpuOp::m_dimension; \ + using GroupLookupBaseCpuOp::m_is_use_default_value_tensor; \ + using GroupLookupBaseCpuOp::m_is_sequence; + +using CPUDevice = Eigen::ThreadPoolDevice; + +template +class GroupEmbeddingVariableLookupCpuOp + : public GroupLookupBaseCpuOp { + USING_BASE_CLASS_MEMBER + + public: + explicit GroupEmbeddingVariableLookupCpuOp(OpKernelConstruction *c) + : GroupLookupBaseCpuOp(c) { + OP_REQUIRES_OK(c, c->GetAttr("is_use_default_value_tensor", + &m_is_use_default_value_tensor)); + } + + void Compute(OpKernelContext *ctx) override { + /* + step 1: unique and assign unique output and index + step 2: doing unique value gather + step 3: assign unique embedding to batch result and pooling + */ + auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); + + for (int i = 0; i < m_num_lookup; ++i) { + EmbeddingVar *embedding_var = nullptr; + OP_REQUIRES_OK( + ctx, LookupResource(ctx, HandleFromInput(ctx, i), &embedding_var)); + core::ScopedUnref unref_me(embedding_var); + + const Tensor &sp_values_tensor = ctx->input(m_num_lookup + i); + const Tensor &sp_indices_tensor = ctx->input(m_num_lookup * 2 + i); + auto sp_indices = sp_indices_tensor.flat().data(); + int nnz = sp_values_tensor.NumElements(); + const Tensor &dense_shape_tensor = ctx->input(m_num_lookup * 4 + i); + auto dense_shape = dense_shape_tensor.flat().data(); + int64 batch_size = dense_shape[0]; + + OP_REQUIRES( + ctx, + !embedding_var->IsMultiLevel() || (embedding_var->IsMultiLevel() && + embedding_var->CacheSize() >= nnz), + errors::InvalidArgument("MultiLevel EV's Cache size ", + embedding_var->CacheSize(), + " should large than IDs in batch ", nnz)); + + // Stage 1 + Tensor unique_idx_tensor; + Tensor unique_tensor; + Tensor unique_counter; + + UniqueWithoutAxis(ctx, sp_values_tensor, &unique_idx_tensor, + &unique_tensor, &unique_counter, 0, + this->partition_size_, this->serial_, + this->unique_ratio_hint_, this->map_flag_); + + ctx->set_output(m_num_lookup + i, unique_tensor); + ctx->set_output(2 * m_num_lookup + i, unique_idx_tensor); + + auto *unique = unique_tensor.flat().data(); + auto *unique_idx = unique_idx_tensor.flat().data(); + + int unique_nnz = unique_tensor.shape().dim_size(0); + TensorShape unique_shape{static_cast(unique_nnz)}; + + TensorShape batch_nums_tensor_shape = + TensorShape(std::vector({batch_size})); + Tensor *batch_nums_tensor = nullptr; + // allocate output + OP_REQUIRES_OK(ctx, ctx->allocate_output(3 * m_num_lookup + i, + batch_nums_tensor_shape, + &batch_nums_tensor)); + auto batch_nums = batch_nums_tensor->flat().data(); + memset(batch_nums, 0, batch_size * sizeof(int)); + for (int k = 0; k < nnz; ++k) { + int batch_id = sp_indices[k * dense_shape_tensor.NumElements()]; + batch_nums[batch_id] += 1; + } + for (int k = 1; k < 
batch_size; ++k) { + batch_nums[k] += batch_nums[k - 1]; + } + + // Stage 2 + Tensor unique_embedding; + unique_shape.AppendShape({static_cast(m_dimension)}); + AllocatorAttributes attr; + attr.set_on_host(true); + OP_REQUIRES_OK( + ctx, ctx->allocate_temp(DataTypeToEnum::v(), unique_shape, + &unique_embedding, attr)); + auto unique_embedding_data = unique_embedding.flat().data(); + EmbeddingVarContext ev_ctx(ctx); + if (m_is_use_default_value_tensor) { + embedding_var->GetEmbeddings( + ev_ctx, unique, unique_embedding_data, unique_nnz, + reinterpret_cast( + ctx->input(m_num_lookup * 4 + 1).data())); + } else { + embedding_var->GetEmbeddings(ev_ctx, unique, unique_embedding_data, + unique_nnz); + embedding_var->UpdateCache(unique_tensor, unique_counter, + true /*called_by_gather*/); + } + + std::vector default_weights(nnz, 1.0); + TValue *sp_weights = default_weights.data(); + if (!this->m_ignore_weights) { + const Tensor &sp_weights_tensor = + ctx->input(this->m_num_lookup * 3 + i); + sp_weights = + const_cast(sp_weights_tensor.flat().data()); + } + + // Stage 3 + TensorShape emb_vectors_tensor_shape; + // Special case for sequence categorical column output + if (m_is_sequence) { + emb_vectors_tensor_shape = TensorShape( + std::vector({batch_size, dense_shape[1], m_dimension})); + } else { + emb_vectors_tensor_shape = + TensorShape(std::vector({batch_size, m_dimension})); + } + Tensor *gather_embedding_tensor = nullptr; + // allocate output + OP_REQUIRES_OK(ctx, ctx->allocate_output(i, emb_vectors_tensor_shape, + &gather_embedding_tensor)); + auto gather_embedding = gather_embedding_tensor->flat().data(); + + int slice_bytes = nnz / batch_size * m_dimension * 1000; + // todo: clean these redundant code + if (this->m_combiner == "mean") { + auto embedding_var_mean_combiner = [this, &gather_embedding, batch_nums, + unique_idx, unique, + unique_embedding_data, sp_weights]( + int64 start, int64 end) { + for (int64 i = start; i < end; ++i) { +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + __m512 batch_total_weights = _mm512_set1_ps(0.0f); + int tmp_length = (m_dimension + 15) / 16; + __m512 tmp_embedding[tmp_length]; + for (int i = 0; i < tmp_length; ++i) { + tmp_embedding[i] = _mm512_set1_ps(0.0f); + } + int batch_offset = i == 0 ? 0 : batch_nums[i - 1]; + int batch_num = batch_nums[i] - batch_offset; + for (int j = 0; j < batch_num; ++j) { + int unique_indice = unique_idx[batch_offset + j]; + float *u_embedding = + unique_embedding_data + unique_indice * m_dimension; + __m512 _weights = + _mm512_set1_ps(*(sp_weights + batch_offset + j)); + batch_total_weights = + _mm512_add_ps(batch_total_weights, _weights); + for (int d = 0; d < m_dimension; d += 16) { + int index = d / 16; + int remain = m_dimension - d; + __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + __m512 _item = _mm512_maskz_loadu_ps(mask, u_embedding + d); + tmp_embedding[index] = _mm512_mask3_fmadd_ps( + _item, _weights, tmp_embedding[index], mask); + } + } + + if (batch_num == 0) batch_total_weights = _mm512_set1_ps(1.0f); + for (int d = 0; d < m_dimension; d += 16) { + int index = d / 16; + int remain = m_dimension - d; + __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + tmp_embedding[index] = + _mm512_div_ps(tmp_embedding[index], batch_total_weights); + _mm512_mask_storeu_ps(gather_embedding + i * m_dimension + d, + mask, tmp_embedding[index]); + } +#else + TValue batch_total_weights = 0.0f; + std::vector tmp_embedding(m_dimension, 0.0f); + int batch_offset = i == 0 ? 
0 : batch_nums[i - 1]; + int batch_num = batch_nums[i] - batch_offset; + for (int j = 0; j < batch_num; ++j) { + int unique_indice = unique_idx[batch_offset + j]; + float *u_embedding = + unique_embedding_data + unique_indice * m_dimension; + TValue sp_weight = sp_weights[batch_offset + j]; + batch_total_weights += sp_weight; + for (int d = 0; d < m_dimension; ++d) { + tmp_embedding[d] = + std::fma(*(u_embedding + d), sp_weight, tmp_embedding[d]); + } + } + + for (int d = 0; d < m_dimension; ++d) { + tmp_embedding[d] /= batch_total_weights; + } + + memcpy(gather_embedding + i * m_dimension, tmp_embedding.data(), + sizeof(float) * m_dimension); +#endif + } + }; + Shard(worker_threads->num_threads, worker_threads->workers, batch_size, + slice_bytes /*cost*/, embedding_var_mean_combiner); + } else if (this->m_combiner == "sum") { + auto embedding_var_sum_combiner = [this, &gather_embedding, batch_nums, + unique_idx, unique, + unique_embedding_data, + sp_weights](int64 start, int64 end) { + for (int64 i = start; i < end; ++i) { +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + int tmp_length = (m_dimension + 15) / 16; + __m512 tmp_embedding[tmp_length]; + for (int i = 0; i < tmp_length; ++i) { + tmp_embedding[i] = _mm512_set1_ps(0.0f); + } + int batch_offset = i == 0 ? 0 : batch_nums[i - 1]; + int batch_num = batch_nums[i] - batch_offset; + for (int j = 0; j < batch_num; ++j) { + int unique_indice = unique_idx[batch_offset + j]; + float *u_embedding = + unique_embedding_data + unique_indice * m_dimension; + __m512 _weights = + _mm512_set1_ps(*(sp_weights + batch_offset + j)); + for (int d = 0; d < m_dimension; d += 16) { + int index = d / 16; + int remain = m_dimension - d; + __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + __m512 _item = _mm512_maskz_loadu_ps(mask, u_embedding + d); + tmp_embedding[index] = _mm512_mask3_fmadd_ps( + _item, _weights, tmp_embedding[index], mask); + } + } + for (int d = 0; d < m_dimension; d += 16) { + int index = d / 16; + int remain = m_dimension - d; + __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + _mm512_mask_storeu_ps(gather_embedding + i * m_dimension + d, + mask, tmp_embedding[index]); + } +#else + std::vector tmp_embedding(m_dimension, 0.0f); + int batch_offset = i == 0 ? 0 : batch_nums[i - 1]; + int batch_num = batch_nums[i] - batch_offset; + for (int j = 0; j < batch_num; ++j) { + int unique_indice = unique_idx[batch_offset + j]; + float *u_embedding = + unique_embedding_data + unique_indice * m_dimension; + for (int d = 0; d < m_dimension; ++d) { + tmp_embedding[d] = + std::fma(u_embedding[d], sp_weights[batch_offset + j], + tmp_embedding[d]); + } + } + memcpy(gather_embedding + i * m_dimension, tmp_embedding.data(), + sizeof(float) * m_dimension); +#endif + } + }; + Shard(worker_threads->num_threads, worker_threads->workers, batch_size, + slice_bytes /*cost*/, embedding_var_sum_combiner); + } else { + auto embedding_var_sqrtn_combiner = [this, &gather_embedding, + batch_nums, unique_idx, unique, + unique_embedding_data, sp_weights]( + int64 start, int64 end) { + for (int64 i = start; i < end; ++i) { +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + TValue batch_total_weights = 0.0f; + int tmp_length = (m_dimension + 15) / 16; + __m512 tmp_embedding[tmp_length]; + for (int i = 0; i < tmp_length; ++i) { + tmp_embedding[i] = _mm512_set1_ps(0.0f); + } + int batch_offset = i == 0 ? 
0 : batch_nums[i - 1]; + int batch_num = batch_nums[i] - batch_offset; + for (int j = 0; j < batch_num; ++j) { + int unique_indice = unique_idx[batch_offset + j]; + float *u_embedding = + unique_embedding_data + unique_indice * m_dimension; + TValue local_weight = *(sp_weights + batch_offset + j); + __m512 _weights = _mm512_set1_ps(local_weight); + batch_total_weights = + std::fma(local_weight, local_weight, batch_total_weights); + for (int d = 0; d < m_dimension; d += 16) { + int index = d / 16; + int remain = m_dimension - d; + __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + __m512 _item = _mm512_maskz_loadu_ps(mask, u_embedding + d); + tmp_embedding[index] = _mm512_mask3_fmadd_ps( + _item, _weights, tmp_embedding[index], mask); + } + } + __m512 _total_weights; + if (batch_num != 0) { + _total_weights = _mm512_set1_ps(sqrtf(batch_total_weights)); + } else { + _total_weights = _mm512_set1_ps(1.0f); + } + + for (int d = 0; d < m_dimension; d += 16) { + int index = d / 16; + int remain = m_dimension - d; + __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + tmp_embedding[index] = + _mm512_div_ps(tmp_embedding[index], _total_weights); + _mm512_mask_storeu_ps(gather_embedding + i * m_dimension + d, + mask, tmp_embedding[index]); + } +#else + TValue batch_total_weights = 0.0f; + std::vector tmp_embedding(m_dimension, 0.0f); + int batch_offset = i == 0 ? 0 : batch_nums[i - 1]; + int batch_num = batch_nums[i] - batch_offset; + for (int j = 0; j < batch_num; ++j) { + int unique_indice = unique_idx[batch_offset + j]; + float *u_embedding = + unique_embedding_data + unique_indice * m_dimension; + TValue sp_weight = sp_weights[batch_offset + j]; + batch_total_weights = + std::fma(sp_weight, sp_weight, batch_total_weights); + for (int d = 0; d < m_dimension; ++d) { + tmp_embedding[d] = + std::fma(u_embedding[d], sp_weight, tmp_embedding[d]); + } + } + + if (batch_num != 0) { + batch_total_weights = sqrtf(batch_total_weights); + } else { + batch_total_weights = 1.0f; + } + for (int d = 0; d < m_dimension; ++d) { + tmp_embedding[d] /= batch_total_weights; + } + + memcpy(gather_embedding + i * m_dimension, tmp_embedding.data(), + sizeof(float) * m_dimension); +#endif + } + }; + Shard(worker_threads->num_threads, worker_threads->workers, batch_size, + slice_bytes /*cost*/, embedding_var_sqrtn_combiner); + } + } + } +}; + +#define REGISTER_CPU_KERNELS(key_type, value_type) \ + REGISTER_KERNEL_BUILDER( \ + Name("GroupEmbeddingVarLookup") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + GroupEmbeddingVariableLookupCpuOp) + +REGISTER_CPU_KERNELS(int32, float); +REGISTER_CPU_KERNELS(int64, float); +#undef REGISTER_CPU_KERNELS + +template +class GroupVariableLookupCpuOp : public GroupLookupBaseCpuOp { + USING_BASE_CLASS_MEMBER + public: + explicit GroupVariableLookupCpuOp(OpKernelConstruction *c) + : GroupLookupBaseCpuOp(c) {} + + void Compute(OpKernelContext *ctx) override { + auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); + for (int i = 0; i < m_num_lookup; ++i) { + const Tensor &emb_variable_tensor = ctx->input(i); + const Tensor &sp_values_tensor = ctx->input(m_num_lookup + i); + int nnz = sp_values_tensor.NumElements(); + auto embedding_variable = emb_variable_tensor.flat().data(); + + const Tensor &sp_indices_tensor = ctx->input(m_num_lookup * 2 + i); + auto sp_indices = sp_indices_tensor.flat().data(); + + const Tensor &dense_shape_tensor = ctx->input(m_num_lookup * 4 + i); + auto dense_shape = 
dense_shape_tensor.flat().data(); + int64 batch_size = dense_shape[0]; + + TensorShape batch_nums_tensor_shape = + TensorShape(std::vector({batch_size})); + Tensor *batch_nums_tensor = nullptr; + // allocate output + OP_REQUIRES_OK(ctx, ctx->allocate_output(3 * m_num_lookup + i, + batch_nums_tensor_shape, + &batch_nums_tensor)); + auto batch_nums = batch_nums_tensor->flat().data(); + memset(batch_nums, 0, batch_size * sizeof(int)); + for (int k = 0; k < nnz; ++k) { + int batch_id = sp_indices[k * dense_shape_tensor.NumElements()]; + batch_nums[batch_id] += 1; + } + for (int k = 1; k < batch_size; ++k) { + batch_nums[k] += batch_nums[k - 1]; + } + + TensorShape emb_vectors_tensor_shape; + // Special case for sequence categorical column output + if (m_is_sequence) { + emb_vectors_tensor_shape = TensorShape( + std::vector({batch_size, dense_shape[1], m_dimension})); + } else { + emb_vectors_tensor_shape = + TensorShape(std::vector({batch_size, m_dimension})); + } + + Tensor *emb_vectors_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(i, emb_vectors_tensor_shape, + &emb_vectors_tensor)); + auto emb_vectors = emb_vectors_tensor->flat().data(); + + // Stage 1 + Tensor unique_idx_tensor; + Tensor unique_tensor; + Tensor unique_counter; + + UniqueWithoutAxis(ctx, sp_values_tensor, &unique_idx_tensor, + &unique_tensor, &unique_counter, 0, + this->partition_size_, this->serial_, + this->unique_ratio_hint_, this->map_flag_); + + ctx->set_output(m_num_lookup + i, unique_tensor); + ctx->set_output(2 * m_num_lookup + i, unique_idx_tensor); + + auto *unique = unique_tensor.flat().data(); + auto *unique_idx = unique_idx_tensor.flat().data(); + + std::vector default_weights(nnz, 1.0); + TValue *sp_weights = default_weights.data(); + if (!this->m_ignore_weights) { + const Tensor &sp_weights_tensor = + ctx->input(this->m_num_lookup * 3 + i); + sp_weights = + const_cast(sp_weights_tensor.flat().data()); + } + + int slice_bytes = nnz / batch_size * m_dimension * 1000; + if (this->m_combiner == "mean") { + auto do_var_mean = [this, &emb_vectors, batch_nums, unique_idx, unique, + sp_weights, + embedding_variable](int64 start, int64 end) { + for (int64 i = start; i < end; ++i) { +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + __m512 batch_total_weights = _mm512_set1_ps(0.0f); + int tmp_length = (m_dimension + 15) / 16; + __m512 tmp_embedding[tmp_length]; + for (int i = 0; i < tmp_length; ++i) { + tmp_embedding[i] = _mm512_set1_ps(0.0f); + } + int batch_offset = i == 0 ? 0 : batch_nums[i - 1]; + int batch_num = batch_nums[i] - batch_offset; + for (int j = 0; j < batch_num; ++j) { + int unique_indice = unique_idx[batch_offset + j]; + int unique_id = unique[unique_indice]; + __m512 _weights = + _mm512_set1_ps(*(sp_weights + batch_offset + j)); + batch_total_weights = + _mm512_add_ps(batch_total_weights, _weights); + const float *embedding_ptr = + embedding_variable + unique_id * m_dimension; + + for (int d = 0; d < m_dimension; d += 16) { + int index = d / 16; + int remain = m_dimension - d; + __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + __m512 _item = _mm512_maskz_loadu_ps(mask, embedding_ptr + d); + tmp_embedding[index] = _mm512_mask3_fmadd_ps( + _item, _weights, tmp_embedding[index], mask); + } + } + if (batch_num == 0) batch_total_weights = _mm512_set1_ps(1.0f); + for (int d = 0; d < m_dimension; d += 16) { + int index = d / 16; + int remain = m_dimension - d; + __mmask16 mask = (remain >= 16 ? 
0xffff : (1 << remain) - 1); + tmp_embedding[index] = + _mm512_div_ps(tmp_embedding[index], batch_total_weights); + _mm512_mask_storeu_ps(emb_vectors + i * m_dimension + d, mask, + tmp_embedding[index]); + } +#else + TValue batch_total_weights = 0.0f; + std::vector tmp_embedding(m_dimension, 0.0f); + int batch_offset = i == 0 ? 0 : batch_nums[i - 1]; + int batch_num = batch_nums[i] - batch_offset; + for (int j = 0; j < batch_num; ++j) { + int unique_indice = unique_idx[batch_offset + j]; + int unique_id = unique[unique_indice]; + TValue sp_weight = sp_weights[batch_offset + j]; + batch_total_weights += sp_weight; + for (int d = 0; d < m_dimension; ++d) { + tmp_embedding[d] = + std::fma(embedding_variable[unique_id * m_dimension + d], + sp_weight, tmp_embedding[d]); + } + } + for (int d = 0; d < m_dimension; ++d) { + tmp_embedding[d] /= batch_total_weights; + } + memcpy(emb_vectors + i * m_dimension, tmp_embedding.data(), + sizeof(float) * m_dimension); +#endif + } + }; + Shard(worker_threads->num_threads, worker_threads->workers, batch_size, + slice_bytes /*cost*/, do_var_mean); + } else if (this->m_combiner == "sum") { + auto do_var_sum = [this, &emb_vectors, batch_nums, unique_idx, unique, + sp_weights, + embedding_variable](int64 start, int64 end) { + for (int64 i = start; i < end; ++i) { +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + int tmp_length = (m_dimension + 15) / 16; + __m512 tmp_embedding[tmp_length]; + for (int i = 0; i < tmp_length; ++i) { + tmp_embedding[i] = _mm512_set1_ps(0.0f); + } + int batch_offset = i == 0 ? 0 : batch_nums[i - 1]; + int batch_num = batch_nums[i] - batch_offset; + for (int j = 0; j < batch_num; ++j) { + int unique_indice = unique_idx[batch_offset + j]; + int unique_id = unique[unique_indice]; + __m512 _weights = + _mm512_set1_ps(*(sp_weights + batch_offset + j)); + const float *embedding_ptr = + embedding_variable + unique_id * m_dimension; + for (int d = 0; d < m_dimension; d += 16) { + int index = d / 16; + int remain = m_dimension - d; + __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + __m512 _item = _mm512_maskz_loadu_ps(mask, embedding_ptr + d); + tmp_embedding[index] = _mm512_mask3_fmadd_ps( + _item, _weights, tmp_embedding[index], mask); + } + } + for (int d = 0; d < m_dimension; d += 16) { + int index = d / 16; + int remain = m_dimension - d; + __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + _mm512_mask_storeu_ps(emb_vectors + i * m_dimension + d, mask, + tmp_embedding[index]); + } +#else + std::vector tmp_embedding(m_dimension, 0.0f); + int batch_offset = i == 0 ? 
0 : batch_nums[i - 1]; + int batch_num = batch_nums[i] - batch_offset; + for (int j = 0; j < batch_num; ++j) { + int unique_indice = unique_idx[batch_offset + j]; + int unique_id = unique[unique_indice]; + for (int d = 0; d < m_dimension; ++d) { + tmp_embedding[d] = + std::fma(embedding_variable[unique_id * m_dimension + d], + sp_weights[batch_offset + j], tmp_embedding[d]); + } + } + memcpy(emb_vectors + i * m_dimension, tmp_embedding.data(), + sizeof(float) * m_dimension); +#endif + } + }; + Shard(worker_threads->num_threads, worker_threads->workers, batch_size, + slice_bytes /*cost*/, do_var_sum); + } else { + auto do_var_sqrtn = [this, &emb_vectors, batch_nums, unique_idx, unique, + sp_weights, + embedding_variable](int64 start, int64 end) { + for (int64 i = start; i < end; ++i) { +#if defined(__GNUC__) && (__GNUC__ > 6) && (__AVX512F__) + TValue batch_total_weights = 0.0f; + int tmp_length = (m_dimension + 15) / 16; + __m512 tmp_embedding[tmp_length]; + for (int i = 0; i < tmp_length; ++i) { + tmp_embedding[i] = _mm512_set1_ps(0.0f); + } + int batch_offset = i == 0 ? 0 : batch_nums[i - 1]; + int batch_num = batch_nums[i] - batch_offset; + for (int j = 0; j < batch_num; ++j) { + int unique_indice = unique_idx[batch_offset + j]; + int unique_id = unique[unique_indice]; + TValue local_weight = *(sp_weights + batch_offset + j); + __m512 _weights = _mm512_set1_ps(local_weight); + batch_total_weights = + std::fma(local_weight, local_weight, batch_total_weights); + const float *embedding_ptr = + embedding_variable + unique_id * m_dimension; + for (int d = 0; d < m_dimension; d += 16) { + int index = d / 16; + int remain = m_dimension - d; + __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + __m512 _item = _mm512_maskz_loadu_ps(mask, embedding_ptr + d); + tmp_embedding[index] = _mm512_mask3_fmadd_ps( + _item, _weights, tmp_embedding[index], mask); + } + } + + __m512 _total_weights; + if (batch_num != 0) { + _total_weights = _mm512_set1_ps(sqrtf(batch_total_weights)); + } else { + _total_weights = _mm512_set1_ps(1.0f); + } + + for (int d = 0; d < m_dimension; d += 16) { + int index = d / 16; + int remain = m_dimension - d; + __mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1); + tmp_embedding[index] = + _mm512_div_ps(tmp_embedding[index], _total_weights); + _mm512_mask_storeu_ps(emb_vectors + i * m_dimension + d, mask, + tmp_embedding[index]); + } +#else + TValue batch_total_weights = 0.0f; + std::vector tmp_embedding(m_dimension, 0.0f); + int batch_offset = i == 0 ? 
0 : batch_nums[i - 1]; + int batch_num = batch_nums[i] - batch_offset; + for (int j = 0; j < batch_num; ++j) { + int unique_indice = unique_idx[batch_offset + j]; + int unique_id = unique[unique_indice]; + TValue sp_weight = sp_weights[batch_offset + j]; + batch_total_weights = + std::fma(sp_weight, sp_weight, batch_total_weights); + for (int d = 0; d < m_dimension; ++d) { + tmp_embedding[d] = + std::fma(embedding_variable[unique_id * m_dimension + d], + sp_weight, tmp_embedding[d]); + } + } + if (batch_num != 0) { + batch_total_weights = sqrtf(batch_total_weights); + } else { + batch_total_weights = 1.0f; + } + memcpy(emb_vectors + i * m_dimension, tmp_embedding.data(), + sizeof(float) * m_dimension); +#endif + } + }; + Shard(worker_threads->num_threads, worker_threads->workers, batch_size, + slice_bytes /*cost*/, do_var_sqrtn); + } + } + } +}; + +#define REGISTER_CPU_KERNELS(key_type, value_type) \ + REGISTER_KERNEL_BUILDER(Name("GroupVariableLookup") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + GroupVariableLookupCpuOp) + +REGISTER_CPU_KERNELS(int32, float); +REGISTER_CPU_KERNELS(int64, float); +#undef REGISTER_CPU_KERNELS + +#undef USING_BASE_CLASS_MEMBER + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_ops.cu.cc b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_ops.cu.cc new file mode 100644 index 00000000..0295e91e --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/group_embedding/group_embedding_lookup_sparse_forward_ops.cu.cc @@ -0,0 +1,309 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+=======================================================================*/ + +#define EIGEN_USE_THREADS + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU + +#include + +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h" +#include "deepray/custom_ops/embedding_variable/cc/fused_embedding/fused_embedding_common.cu.h" +#include "deepray/custom_ops/utils/spin_rw_lock.h" +#include "group_embedding_lookup_sparse_forward_base_ops.cu.h" +#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/resource_var.h" +#include "tensorflow/core/kernels/training_op_helpers.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { +using GPUDevice = Eigen::GpuDevice; + +template +class GroupEmbeddingVarLookupOp + : public GroupEmbeddingLookupForwardBaseOp { + public: + explicit GroupEmbeddingVarLookupOp(OpKernelConstruction* c) + : GroupEmbeddingLookupForwardBaseOp(c) { + OP_REQUIRES_OK(c, c->GetAttr("is_use_default_value_tensor", + &is_use_default_value_tensor_)); + } + + void Compute(OpKernelContext* ctx) override { + const auto& device = ctx->eigen_device(); + TValue* default_v = nullptr; + int64 batch_size = -1; + + Allocator* gpu_allocator = + ctx->device()->GetAllocator(AllocatorAttributes()); + GroupEmbeddingLookupForWard lookuper( + this->num_lookups_, this->dimension_, this->max_norm_, gpu_allocator); + + std::vector tensor_list; + tensor_list.reserve(this->num_lookups_); + + for (int i = 0; i < this->num_lookups_; ++i) { + EmbeddingVar* ev = nullptr; + OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, i), &ev)); + core::ScopedUnref unref_me(ev); + int64 dimension = ev->ValueLen(); + + const Tensor& sp_values_tensor = ctx->input(this->num_lookups_ + i); + auto sp_values = sp_values_tensor.flat(); + int64 N = sp_values_tensor.NumElements(); + + const Tensor& sp_indices_tensor = ctx->input(this->num_lookups_ * 2 + i); + auto sp_indices = sp_indices_tensor.flat().data(); + int nnz = sp_indices_tensor.shape().dim_size(0); + const Tensor& dense_shape_tensor = ctx->input(this->num_lookups_ * 4 + i); + auto dense_shape = dense_shape_tensor.flat().data(); + int dense_shape_num = dense_shape_tensor.NumElements(); + batch_size = dense_shape[0]; + + TValue* default_v = nullptr; + if (is_use_default_value_tensor_) { + default_v = (TValue*)ctx->input(5 * this->num_lookups_).data(); + } else { + default_v = ev->GetDefaultValuePtr(); + } + + // DEBUG + const TFKey* key_base = sp_values.data(); + Tensor out_tensor; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::value, + {N * dimension}, &out_tensor)); + TValue* out_base = out_tensor.flat().data(); + + EmbeddingVarContext ev_ctx(ctx); + if (ev->IsSingleHbm()) { + if (is_use_default_value_tensor_) { + Tensor default_values(ctx->input(5 * this->num_lookups_)); + auto default_value_num = default_values.NumElements() / dimension; + auto default_values_matrix = + default_values.shaped({default_value_num, dimension}); + TValue* default_v_base = &default_values_matrix(0, 0); + ev->GetEmbeddings(ev_ctx, key_base, out_base, N); + } else { + ev->GetEmbeddings(ev_ctx, key_base, out_base, N); + } + } else { + TensorShape indices_host_shape = sp_values_tensor.shape(); + Tensor indices_host(sp_indices_tensor.dtype(), indices_host_shape); + // Copy ids from GPU to CPU 
for CPU Lookup. + auto stream = ctx->op_device_context()->stream(); + auto event_mgr = + ctx->device()->tensorflow_accelerator_device_info()->event_mgr; + se::DeviceMemoryBase gpu_src(const_cast(key_base), + N * sizeof(TFKey)); + stream->ThenMemcpy(indices_host.data(), gpu_src, N * sizeof(TFKey)); + SyncWithEventMgr(stream, event_mgr); + EmbeddingVarContext ev_ctx(ctx); + ev->GetEmbeddings(ev_ctx, (TFKey*)indices_host.data(), out_base, N); + ev->UpdateCache(indices_host, true); + } + + TensorShape emb_vectors_tensor_shape; + // Special case for sequence categorical column output + if (this->is_sequence_) { + emb_vectors_tensor_shape = TensorShape( + std::vector({batch_size, dense_shape[1], dimension})); + } else { + emb_vectors_tensor_shape = + TensorShape(std::vector({batch_size, dimension})); + } + + Tensor* op_output_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(i, emb_vectors_tensor_shape, + &op_output_tensor)); + auto op_output = op_output_tensor->flat().data(); + + // allocate offset tensor + TensorShape values_offset_tensor_shape = + TensorShape(std::vector({batch_size})); + + // Fake Output + Tensor* unique_keys_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output( + {this->num_lookups_ + i}, this->num_lookups_ + i, + sp_values_tensor.shape(), &unique_keys_tensor)); + + Tensor* unique_idx_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(this->num_lookups_ * 2 + i, + values_offset_tensor_shape, + &unique_idx_tensor)); + + Tensor* values_offset_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(this->num_lookups_ * 3 + i, + values_offset_tensor_shape, + &values_offset_tensor)); + auto values_offset = values_offset_tensor->flat().data(); + + launch_cal_per_element_row_offset( + batch_size, nnz, dense_shape_num, + reinterpret_cast(sp_indices), values_offset, + device.stream()); + + TValue* sp_weights = nullptr; + if (!this->ignore_weights_) { + const Tensor& sp_weights_tensor = + ctx->input(this->num_lookups_ * 3 + i); + sp_weights = + const_cast(sp_weights_tensor.flat().data()); + } + + GroupEmbeddingForWardArgs group_embedding_args( + out_base, sp_weights, op_output, + const_cast(reinterpret_cast(key_base)), + values_offset, nnz); + + lookuper.set(group_embedding_args); + tensor_list.emplace_back(out_tensor); + } + + if (this->combiner_ == "sum") { + this->template compute(lookuper, batch_size, device.stream()); + } else if (this->combiner_ == "mean") { + this->template compute(lookuper, batch_size, device.stream()); + } else { + this->template compute(lookuper, batch_size, + device.stream()); + } + } + + private: + bool is_use_default_value_tensor_; +}; + +#define REGISTER_GPU_KERNELS(key_type_tf, key_type, dtype_tf, dtype) \ + REGISTER_KERNEL_BUILDER( \ + Name("GroupEmbeddingVarLookup") \ + .Device(DEVICE_GPU) \ + .HostMemory("dense_shape") \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + GroupEmbeddingVarLookupOp) + +REGISTER_GPU_KERNELS(int64, int64_t, float, float); +REGISTER_GPU_KERNELS(int32, int32_t, float, float); +#undef REGISTER_GPU_KERNELS + +template +class GroupVariableLookupOp + : public GroupEmbeddingLookupForwardBaseOp { + public: + explicit GroupVariableLookupOp(OpKernelConstruction* c) + : GroupEmbeddingLookupForwardBaseOp(c) {} + + void Compute(OpKernelContext* ctx) override { + const cudaStream_t stream = ctx->eigen_device().stream(); + Allocator* gpu_allocator = + ctx->device()->GetAllocator(AllocatorAttributes()); + GroupEmbeddingLookupForWard lookuper( + this->num_lookups_, 
this->dimension_, this->max_norm_, gpu_allocator); + int64 batch_size = -1; + + for (int i = 0; i < this->num_lookups_; ++i) { + const Tensor& emb_variable_tensor = ctx->input(i); + const Tensor& sp_values_tensor = ctx->input(this->num_lookups_ + i); + int64 emb_vec_size = emb_variable_tensor.shape().dim_size(1); + + const Tensor& sp_indices_tensor = ctx->input(this->num_lookups_ * 2 + i); + auto sp_indices = sp_indices_tensor.flat().data(); + int nnz = sp_indices_tensor.shape().dim_size(0); + const Tensor& dense_shape_tensor = ctx->input(this->num_lookups_ * 4 + i); + auto dense_shape = dense_shape_tensor.flat().data(); + int dense_shape_num = dense_shape_tensor.NumElements(); + batch_size = dense_shape[0]; + + TensorShape emb_vectors_tensor_shape; + // Special case for sequence categorical column output + if (this->is_sequence_) { + emb_vectors_tensor_shape = TensorShape( + std::vector({batch_size, dense_shape[1], emb_vec_size})); + } else { + emb_vectors_tensor_shape = + TensorShape(std::vector({batch_size, emb_vec_size})); + } + Tensor* emb_vectors_tensor = nullptr; + // allocate output + OP_REQUIRES_OK(ctx, ctx->allocate_output(i, emb_vectors_tensor_shape, + &emb_vectors_tensor)); + auto emb_vectors = emb_vectors_tensor->flat().data(); + + // allocate offset tensor + TensorShape values_offset_tensor_shape = + TensorShape(std::vector({batch_size})); + // Fake Output + Tensor* unique_keys_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output( + {this->num_lookups_ + i}, this->num_lookups_ + i, + sp_values_tensor.shape(), &unique_keys_tensor)); + + Tensor* unique_idx_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(this->num_lookups_ * 2 + i, + values_offset_tensor_shape, + &unique_idx_tensor)); + Tensor* values_offset_tensor = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(this->num_lookups_ * 3 + i, + values_offset_tensor_shape, + &values_offset_tensor)); + auto values_offset = values_offset_tensor->flat().data(); + launch_cal_per_element_row_offset( + batch_size, nnz, dense_shape_num, + reinterpret_cast(sp_indices), values_offset, stream); + + TValue* sp_weights = nullptr; + if (!this->ignore_weights_) { + const Tensor& sp_weights_tensor = + ctx->input(this->num_lookups_ * 3 + i); + sp_weights = + const_cast(sp_weights_tensor.flat().data()); + } + GroupEmbeddingForWardArgs group_embedding_args( + const_cast(emb_variable_tensor.flat().data()), + sp_weights, emb_vectors, + const_cast(reinterpret_cast( + sp_values_tensor.flat().data())), + values_offset, nnz); + lookuper.set(group_embedding_args); + } + + if (this->combiner_ == "sum") { + this->template compute(lookuper, batch_size, stream); + } else if (this->combiner_ == "mean") { + this->template compute(lookuper, batch_size, stream); + } else { + this->template compute(lookuper, batch_size, stream); + } + } +}; + +#define REGISTER_GPU_KERNELS(key_type_tf, key_type, dtype_tf, dtype) \ + REGISTER_KERNEL_BUILDER(Name("GroupVariableLookup") \ + .Device(DEVICE_GPU) \ + .HostMemory("dense_shape") \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + GroupVariableLookupOp) + +REGISTER_GPU_KERNELS(int64, int64_t, float, float); +REGISTER_GPU_KERNELS(int32, int32_t, float, float); +#undef REGISTER_GPU_KERNELS + +} // namespace tensorflow + +#endif // GOOGLE_CUDA diff --git a/deepray/custom_ops/embedding_variable/cc/incr_save_restore/incr_save_restore_ops.cc b/deepray/custom_ops/embedding_variable/cc/incr_save_restore/incr_save_restore_ops.cc new file mode 100644 index 00000000..d91852cb --- 
/dev/null +++ b/deepray/custom_ops/embedding_variable/cc/incr_save_restore/incr_save_restore_ops.cc @@ -0,0 +1,493 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "incr_save_restore_ops.h" + +#include "tensorflow/core/framework/resource_handle.h" + +namespace tensorflow { + +template +class RecordSparseIndicesOp : public OpKernel { + public: + explicit RecordSparseIndicesOp(OpKernelConstruction* context) + : OpKernel(context), auto_record_(false) { + OP_REQUIRES_OK(context, + context->GetAttr("var_name", &sparse_incr_res_name_)); + OP_REQUIRES_OK(context, context->GetAttr("auto_record", &auto_record_)); + } + + void Compute(OpKernelContext* ctx) override { + IndicesIncrRecorder* sparse_incr_res = nullptr; + auto rm = ctx->resource_manager(); + OP_REQUIRES_OK( + ctx, rm->LookupOrCreate>( + "", sparse_incr_res_name_ + "_sparse_incr", &sparse_incr_res, + [this](IndicesIncrRecorder** ptr) { + *ptr = + new IndicesIncrRecorder(sparse_incr_res_name_); + if (auto_record_) { + (*ptr)->UpdateGlobalVersion(); + } + VLOG(2) << "sparse_incr_res created, name:" + << sparse_incr_res_name_; + return OkStatus(); + })); + sparse_incr_res->UpdateIndices(ctx->input(0), ctx); + } + + private: + string sparse_incr_res_name_; + bool auto_record_; +}; + +REGISTER_KERNEL_BUILDER(Name("RecordSparseIndices") + .Device(DEVICE_CPU) + .TypeConstraint("TIndex"), + RecordSparseIndicesOp); + +REGISTER_KERNEL_BUILDER(Name("RecordSparseIndices") + .Device(DEVICE_CPU) + .TypeConstraint("TIndex"), + RecordSparseIndicesOp); + +REGISTER_KERNEL_BUILDER(Name("RecordSparseIndices") + .Device(DEVICE_GPU) + .TypeConstraint("TIndex"), + RecordSparseIndicesOp); + +REGISTER_KERNEL_BUILDER(Name("RecordSparseIndices") + .Device(DEVICE_GPU) + .TypeConstraint("TIndex"), + RecordSparseIndicesOp); + +class ActivateSparseRecorderOp : public OpKernel { + public: + explicit ActivateSparseRecorderOp(OpKernelConstruction* context) + : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + const Tensor& tensor_names = context->input(0); + const auto& tensor_names_flat = tensor_names.flat(); + const int num_tensors = static_cast(tensor_names.NumElements()); + + auto rm = context->resource_manager(); + for (int i = 0; i < num_tensors; ++i) { + const string& tensor_name = tensor_names_flat(i); + // cast forcely to IndicesIncrRecorder for incr cpkt + string incr_res_name = tensor_name + "_sparse_incr"; + IndicesIncrRecorder* sparse_incr_res = nullptr; + rm->Lookup("", incr_res_name, &sparse_incr_res); + if (sparse_incr_res != nullptr) { + sparse_incr_res->UpdateGlobalVersion(); + } else { + IndicesIncrRecorder* sparse_incr_res = nullptr; + rm->Lookup("", incr_res_name, &sparse_incr_res); + if (sparse_incr_res != nullptr) { + sparse_incr_res->UpdateGlobalVersion(); + } else { + LOG(WARNING) << tensor_name << "_sparse_incr" + << " Resource NOT FOUND"; + } + } + } + } +}; + 
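+// RecordSparseIndices creates (or reuses) an IndicesIncrRecorder resource named
+// "<var_name>_sparse_incr" and feeds it the indices touched on each step, while
+// ActivateSparseRecorder flips the recorder's global version so recording
+// actually begins (UpdateIndices is a no-op until the recorder is activated,
+// either here or via the auto_record attribute). The paired Lookup calls above
+// presumably probe the recorder under each supported index type.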
+REGISTER_KERNEL_BUILDER(Name("ActivateSparseRecorder").Device(DEVICE_CPU), + ActivateSparseRecorderOp); + +class IncrSaveOp : public OpKernel { + public: + explicit IncrSaveOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("dtypes", &tensor_types_)); + } + + void Compute(OpKernelContext* context) override { + const int kFixedInputs = 4; // Prefix, tensor names, is_sparse + const Tensor& prefix = context->input(0); + const string& prefix_string = prefix.scalar()(); + const Tensor& tensor_names = context->input(1); + const Tensor& shape_and_slices = context->input(2); + const Tensor& is_sparse = context->input(3); + const int num_tensors = static_cast(tensor_names.NumElements()); + const auto& tensor_names_flat = tensor_names.flat(); + const auto& is_sparse_flat = is_sparse.flat(); + const auto& shape_and_slices_flat = shape_and_slices.flat(); + LOG(INFO) << "prefix_string: " << prefix_string + << "num tensors:" << num_tensors; + auto rm = context->resource_manager(); + BundleWriter writer(Env::Default(), prefix_string); + + for (int i = 0; i < num_tensors; i++) { + const string& tensor_name = tensor_names_flat(i); + if (is_sparse_flat(i)) { + IndicesIncrRecorder* sparse_incr_res = nullptr; + rm->Lookup("", tensor_name + "_sparse_incr", &sparse_incr_res); + if (sparse_incr_res != nullptr) { + DumpIncrSparse(context, i, kFixedInputs, tensor_name, &writer, + sparse_incr_res); + } else { + IndicesIncrRecorder* sparse_incr_res = nullptr; + rm->Lookup("", tensor_name + "_sparse_incr", &sparse_incr_res); + if (sparse_incr_res != nullptr) { + DumpIncrSparse(context, i, kFixedInputs, tensor_name, + &writer, sparse_incr_res); + } else { + LOG(WARNING) << tensor_name << "_sparse_incr" + << " Resource NOT FOUND"; + } + } + } else { + const Tensor& tensor = context->input(i + kFixedInputs); + + if (!shape_and_slices_flat(i).empty()) { + const string& shape_spec = shape_and_slices_flat(i); + TensorShape shape; + TensorSlice slice(tensor.dims()); + TensorShape slice_shape; + + OP_REQUIRES_OK(context, + checkpoint::ParseShapeAndSlice(shape_spec, &shape, + &slice, &slice_shape)); + OP_REQUIRES( + context, slice_shape.IsSameSize(tensor.shape()), + errors::InvalidArgument( + "Slice in shape_and_slice " + "specification does not match the " + "shape of the tensor to save: ", + shape_spec, ", tensor: ", tensor.shape().DebugString())); + + OP_REQUIRES_OK(context, + writer.AddSlice(tensor_name, shape, slice, tensor)); + } else { + OP_REQUIRES_OK(context, writer.Add(tensor_name, tensor)); + } + } + } + OP_REQUIRES_OK(context, writer.Finish()); + } + + private: + template + void DumpIncrSparse(OpKernelContext* context, int i, const int& kFixedInputs, + const string& tensor_name, BundleWriter* writer, + IndicesIncrRecorder* sparse_incr_res) { + if (tensor_types_[i] == DT_RESOURCE) { + // ev, must be sparse + EmbeddingVar* variable = nullptr; + OP_REQUIRES_OK( + context, + LookupResource(context, HandleFromInput(context, i + kFixedInputs), + &variable)); + core::ScopedUnref unref_variable(variable); + OP_REQUIRES_OK(context, sparse_incr_res->DumpSparseEmbeddingTensor( + tensor_name, variable, writer, context)); + } else { + const Tensor& sparse_var = context->input(i + kFixedInputs); + OP_REQUIRES_OK(context, sparse_incr_res->DumpSparseNormalTensor( + tensor_name, sparse_var, writer)); + } + } + + private: + DataTypeVector tensor_types_; +}; + +REGISTER_KERNEL_BUILDER(Name("IncrSave").Device(DEVICE_CPU), IncrSaveOp); + +class IncrRestoreOp : public OpKernel { + 
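+  // Restores tensors written by IncrSave. For sparse inputs it opens a
+  // BundleReader on `prefix` and reads the "-sparse_incr_keys" /
+  // "-sparse_incr_values" / "-sparse_incr_versions" entries: the
+  // embedding-variable case emits them as three outputs, while the
+  // plain-variable case merges the restored rows into the full tensor passed
+  // as input 4. Non-sparse inputs simply fall back to RestoreTensorsV2.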
public: + explicit IncrRestoreOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("dtypes", &tensor_types_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& prefix = context->input(0); + const string& prefix_string = prefix.scalar()(); + const Tensor& tensor_names = context->input(1); + const Tensor& shape_and_slices = context->input(2); + const Tensor& is_sparse_tensor = context->input(3); + const bool& is_sparse = is_sparse_tensor.scalar()(); + const auto& shape_and_slices_flat = shape_and_slices.flat(); + const int num_tensors = static_cast(tensor_names.NumElements()); + if (is_sparse) { + BundleReader reader(Env::Default(), prefix_string); + OP_REQUIRES_OK(context, reader.status()); + VLOG(1) << "BundleReader incr, prefix_string: " << prefix_string; + LOG(INFO) << "BundleReader incr, prefix_string: " << prefix_string; + const auto& tensor_names_flat = tensor_names.flat(); + if (num_tensors > 1) { + // EV + if (num_tensors != 3) { + OP_REQUIRES_OK( + context, + errors::InvalidArgument( + "Incr cpkt restore for ev must has 3 tensors, actually ", + num_tensors, " given")); + } + + const string& ev_keys_name = tensor_names_flat(0); + string incr_tensor_name = + ev_keys_name.substr(0, ev_keys_name.find("-keys")); + // 1 read keys, values and versions + TensorShape incr_shape; + Tensor* incr_keys_tensor = nullptr; + Tensor* incr_values_tensor = nullptr; + Tensor* incr_versions_tensor = nullptr; + OP_REQUIRES_OK( + context, reader.LookupTensorShape( + incr_tensor_name + "-sparse_incr_keys", &incr_shape)); + OP_REQUIRES_OK(context, context->allocate_output(0, incr_shape, + &incr_keys_tensor)); + OP_REQUIRES_OK(context, + reader.Lookup(incr_tensor_name + "-sparse_incr_keys", + incr_keys_tensor)); + OP_REQUIRES_OK(context, reader.LookupTensorShape( + incr_tensor_name + "-sparse_incr_values", + &incr_shape)); + OP_REQUIRES_OK(context, context->allocate_output(1, incr_shape, + &incr_values_tensor)); + OP_REQUIRES_OK(context, + reader.Lookup(incr_tensor_name + "-sparse_incr_values", + incr_values_tensor)); + + OP_REQUIRES_OK(context, reader.LookupTensorShape( + incr_tensor_name + "-sparse_incr_versions", + &incr_shape)); + OP_REQUIRES_OK(context, context->allocate_output( + 2, incr_shape, &incr_versions_tensor)); + OP_REQUIRES_OK(context, + reader.Lookup(incr_tensor_name + "-sparse_incr_versions", + incr_versions_tensor)); + } else { + // 1 Read keys from incr ckpt + TensorShape keys_shape; + Tensor keys_tensor; + DataType key_type; + + const string& tensor_name = tensor_names_flat(0); + OP_REQUIRES_OK(context, reader.LookupDtypeAndShape( + tensor_name + "-sparse_incr_keys", + &key_type, &keys_shape)); + + OP_REQUIRES_OK(context, context->allocate_temp(key_type, keys_shape, + &keys_tensor)); + + OP_REQUIRES_OK(context, reader.Lookup(tensor_name + "-sparse_incr_keys", + &keys_tensor)); + + LOG(INFO) << "Finished restoring incr normal sparse keys tensor:" + << tensor_name.data() + << ", size:" << keys_tensor.TotalBytes(); + + // 2 Read values from incr ckpt + TensorShape values_shape; + Tensor values_tensor; + + OP_REQUIRES_OK(context, + reader.LookupTensorShape( + tensor_name + "-sparse_incr_values", &values_shape)); + + OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, values_shape, + &values_tensor)); + + OP_REQUIRES_OK( + context, + reader.Lookup(tensor_name + "-sparse_incr_values", &values_tensor)); + + LOG(INFO) << "Finished restoring incr normal sparse values tensor:" + << tensor_name.data() + << ", size:" << 
values_tensor.TotalBytes(); + // 3 do incr update + const Tensor& orig_sparse_tensor = context->input(4); + Tensor* new_sparse_tensor = nullptr; + OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( + {4}, 0, orig_sparse_tensor.shape(), + &new_sparse_tensor)); + + // 3.1 update specific rows + auto incr_values_flat = values_tensor.template matrix(); + auto new_values_flat = new_sparse_tensor->template matrix(); + auto limit = new_sparse_tensor->dim_size(1); + + for (auto i = 0; i < keys_tensor.NumElements(); i++) { + if (key_type == DT_INT32) { + auto incr_key = + keys_tensor.flat::Type>()(i); + if (incr_key >= new_sparse_tensor->dim_size(0)) continue; + for (auto j = 0; j < limit; j++) { + new_values_flat(incr_key, j) = incr_values_flat(i, j); + } + } else { + auto incr_key = + keys_tensor.flat::Type>()(i); + if (incr_key >= new_sparse_tensor->dim_size(0)) continue; + for (auto j = 0; j < limit; j++) { + new_values_flat(incr_key, j) = incr_values_flat(i, j); + } + } + } + LOG(INFO) << "Finished restoring normal sparse tensor(full+incr):" + << tensor_name.data() + << ", size:" << new_sparse_tensor->TotalBytes(); + } + } else { + RestoreTensorsV2(context, prefix, tensor_names, shape_and_slices, + tensor_types_); + } + } + + private: + DataTypeVector tensor_types_; +}; + +REGISTER_KERNEL_BUILDER(Name("IncrRestore").Device(DEVICE_CPU), IncrRestoreOp); + +class CollectSparseIndicesOp : public OpKernel { + public: + explicit CollectSparseIndicesOp(OpKernelConstruction* context) + : OpKernel(context), update_count_thd_(0) { + string config_str; + OP_REQUIRES_OK(context, context->GetAttr("config", &config_str)); + OP_REQUIRES_OK(context, ParseConfig(config_str)); + OP_REQUIRES_OK(context, context->GetAttr("ktype", &tensor_type_)); + OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name_)); + + int64 part_idx, part_count, hash_bucket_size; + OP_REQUIRES_OK(context, context->GetAttr("part_idx", &part_idx)); + OP_REQUIRES_OK(context, context->GetAttr("part_count", &part_count)); + OP_REQUIRES_OK(context, + context->GetAttr("hash_bucket_size", &hash_bucket_size)); + + if (part_count > 0 && hash_bucket_size > 0) { + string part_mode_str; + OP_REQUIRES_OK(context, context->GetAttr("part_mode", &part_mode_str)); + if (part_mode_str == "mod") { + partitioner_ = std::move(std::unique_ptr( + new ModSparsePartitioner(part_count, part_idx, hash_bucket_size))); + } else { + partitioner_ = std::move(std::unique_ptr( + new DivSparsePartitioner(part_count, part_idx, hash_bucket_size))); + } + } + } + + void Compute(OpKernelContext* context) override { + if (tensor_type_ == DT_INT32) { + OP_REQUIRES_OK(context, + ExportSparseIndices(tensor_name_, context)); + } else if (tensor_type_ == DT_INT64) { + OP_REQUIRES_OK(context, + ExportSparseIndices(tensor_name_, context)); + } else { + LOG(WARNING) << "Not support key type:" << DataTypeString(tensor_type_); + } + } + + private: + template + Status ExportSparseIndices(const string& tensor_name, + OpKernelContext* context) { + auto rm = context->resource_manager(); + string resource_name = tensor_name + "_sparse_incr"; + IndicesIncrRecorder* sparse_incr_res = nullptr; + rm->Lookup("", resource_name, &sparse_incr_res); + if (sparse_incr_res == nullptr) { + LOG(WARNING) << tensor_name << " Resource NOT FOUND"; + return OkStatus(); + } + return DoExportSparseIndices(sparse_incr_res, context); + } + + template + Status DoExportSparseIndices(IndicesIncrRecorder* sparse_incr_res, + OpKernelContext* ctx) { + std::unordered_map indices; + 
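+    // SwapIndices() hands over the recorded key -> update-count map and resets
+    // the recorder; FilterIndices() keeps only keys seen at least
+    // update_count_thd_ times. With a partitioner configured, each local key
+    // is also mapped to a global id: "mod" partitioning computes
+    //   global = local * part_count + part_idx,
+    // and "div" partitioning adds the partition's starting offset,
+    //   global = part_offset_start + local.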
sparse_incr_res->SwapIndices(indices); + std::vector filtered_indices; + FilterIndices(indices, filtered_indices); + + Tensor* keys_out = nullptr; + Tensor* global_keys_out = nullptr; + TF_RETURN_IF_ERROR(ctx->allocate_output( + 0, TensorShape({(int64)filtered_indices.size()}), &keys_out)); + + TF_RETURN_IF_ERROR(ctx->allocate_output( + 1, TensorShape({(int64)filtered_indices.size()}), &global_keys_out)); + + auto keys_out_flat = keys_out->flat(); + auto global_keys_out_flat = global_keys_out->flat(); + for (size_t i = 0; i < filtered_indices.size(); i++) { + KeyType k = filtered_indices[i]; + KeyType global_k = k; + if (partitioner_) { + global_k = (KeyType)partitioner_->CalcGlobalOffset(k); + VLOG(2) << partitioner_->toString() << ", key:" << k + << ", global key:" << global_k; + } + keys_out_flat(i) = k; + global_keys_out_flat(i) = global_k; + } + return OkStatus(); + } + + template + void FilterIndices(const std::unordered_map& indices, + std::vector& filtered_indices) { + filtered_indices.reserve(indices.size()); + for (const auto& it : indices) { + const auto& key = it.first; + uint64 update_count = it.second; + if (update_count >= update_count_thd_) { + filtered_indices.push_back(key); + } + } + } + + Status ParseConfig(const string& config_str) { + LOG(INFO) << "Collect sparse indices config:" << config_str; + std::vector configs = str_util::Split(config_str, ","); + for (size_t i = 0; i < configs.size(); i++) { + const string& s = configs[i]; + std::vector kv = str_util::Split(s, "="); + if (kv.size() < 2) { + LOG(WARNING) << "invalid config:" << s; + continue; + } + if (kv[0] == "update_count_thd") { + if (!strings::safe_strtou64(kv[1], &update_count_thd_)) { + LOG(WARNING) << "invalid config:" << s; + } + } + } + + LOG(INFO) << "Parse collect sparse indices config success," + << "update_cound_thd=" << update_count_thd_; + + return OkStatus(); + } + + private: + std::string tensor_name_; + DataType tensor_type_; + uint64 update_count_thd_; + std::unique_ptr partitioner_; +}; + +REGISTER_KERNEL_BUILDER(Name("CollectSparseIndices").Device(DEVICE_CPU), + CollectSparseIndicesOp); + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/incr_save_restore/incr_save_restore_ops.h b/deepray/custom_ops/embedding_variable/cc/incr_save_restore/incr_save_restore_ops.h new file mode 100644 index 00000000..71671cd8 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/incr_save_restore/incr_save_restore_ops.h @@ -0,0 +1,553 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_INCR_SAVE_RESTORE_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_INCR_SAVE_RESTORE_OPS_H_ + +#include +#include + +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h" +#include "deepray/custom_ops/embedding_variable/cc/kernels/save_restore_tensor_ev.h" +#include "deepray/custom_ops/embedding_variable/cc/lib/tensor_bundle.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/save_restore_tensor.h" +#include "tensorflow/core/kernels/variable_ops.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/platform/blocking_counter.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/saved_tensor_slice_util.h" +#include "tensorflow/core/util/tensor_slice_reader.h" + +namespace tensorflow { +template +class ThreadSafeHashMap { + public: + ThreadSafeHashMap() {} + ~ThreadSafeHashMap() {} + + public: + void Update(const Tensor& indices, int64 start, int64 end) { + mutex_lock l(lock_); + auto indices_flat = indices.flat(); + for (int64 idx = start; idx < end; idx++) { + auto indice = indices_flat(idx); + auto it = hash_map_.find(indice); + if (it == hash_map_.end()) { + hash_map_[indice] = 1; + } else { + it->second = it->second + 1; + } + } + } + + void Swap(std::unordered_map& out) { + mutex_lock l(lock_); + hash_map_.swap(out); + } + + void GetKeys(std::set& key_set) { + mutex_lock l(lock_); + for (auto it : hash_map_) { + key_set.insert(it.first); + } + } + + void Clear() { + mutex_lock l(lock_); + hash_map_.clear(); + } + + private: + std::unordered_map hash_map_; + mutex lock_; +}; + +template +class ParallelHashMap { + public: + explicit ParallelHashMap(int min_part_size = 128, int part_count = 32) + : part_count_(part_count), min_part_size_(min_part_size) { + hash_maps_.resize(part_count_); + } + + void Update(const Tensor& indices, OpKernelContext* ctx) { + const int64 N = indices.NumElements(); + auto thread_pool = *(ctx->device()->tensorflow_cpu_worker_threads()); + + std::vector> parts; + SplitParallelParts( + N, std::min(part_count_, thread_pool.workers->NumThreads()), parts); + + int part_count = parts.size(); + BlockingCounter counter(part_count); + for (int i = 0; i < part_count; i++) { + int64 start = parts[i].first; + int64 end = parts[i].second; + thread_pool.workers->Schedule([this, indices, i, start, end, &counter]() { + hash_maps_[i].Update(indices, start, end); + counter.DecrementCount(); + }); + } + counter.Wait(); + } + + void Swap(std::unordered_map& indices) { + std::vector> tmp_maps; + tmp_maps.resize(part_count_); + for (int i = 0; i < part_count_; i++) { + hash_maps_[i].Swap(tmp_maps[i]); + } + + indices.clear(); + for (int i = 0; i < part_count_; i++) { + for (auto it : tmp_maps[i]) { + auto indiceIt = indices.find(it.first); + if (indiceIt == indices.end()) { + indices[it.first] = it.second; + } else { + indices[it.first] += it.second; + } + } + } + } + + void Clear() { + for (size_t i = 0; i < part_count_; i++) { + 
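+      // Each ThreadSafeHashMap shard guards itself with its own mutex, so the
+      // shards can be cleared (or updated concurrently by Update()) without a
+      // global lock.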
hash_maps_[i].Clear(); + } + } + + void GetKeys(std::set& key_set) { + for (size_t i = 0; i < part_count_; i++) { + hash_maps_[i].GetKeys(key_set); + } + } + + void SplitParallelParts(int64 total_num, int64 part_count, + std::vector>& parts) { + if (total_num == 0) { + return; + } + + int64 actual_part_count = part_count; + int64 part_size = total_num / actual_part_count; + if (part_size < min_part_size_) { + actual_part_count = total_num / min_part_size_; + actual_part_count = actual_part_count == 0 ? 1 : actual_part_count; + } + + part_size = total_num / actual_part_count; + int64 left = total_num % actual_part_count; + int64 start = 0; + for (int i = 0; i < actual_part_count; i++) { + int64 end = start + part_size + (left > 0 ? 1 : 0); + parts.push_back(std::make_pair(start, end)); + start = end; + left -= 1; + } + } + + private: + std::vector> hash_maps_; + int part_count_; + int min_part_size_; +}; + +template +class IncrKeyDumpIterator : public DumpIterator { + public: + explicit IncrKeyDumpIterator(std::vector& incr_keys) + : incr_keys_(incr_keys) { + keys_iter_ = incr_keys_.begin(); + } + + bool HasNext() const { return keys_iter_ != incr_keys_.end(); } + + K Next() { return *keys_iter_++; } + + private: + std::vector& incr_keys_; + typename std::vector::iterator keys_iter_; +}; + +template +class IncrEVValueDumpIterator : public DumpIterator { + public: + IncrEVValueDumpIterator(std::vector& incr_keys, + EmbeddingVar*& emb_var) + : incr_keys_(incr_keys), emb_var_(emb_var) { + keys_iter_ = incr_keys_.begin(); + keys_idx_ = 1; + col_idx_ = 0; + } + + bool HasNext() const { + if (keys_iter_ != incr_keys_.end()) { + if (keys_idx_ < incr_keys_.size()) { + return true; + } else { + return col_idx_ < emb_var_->ValueLen(); + } + } else { + return false; + } + } + + T Next() { + if (col_idx_ >= emb_var_->ValueLen()) { + keys_iter_++; + keys_idx_++; + col_idx_ = 0; + } + void* value_ptr = NULL; + TF_CHECK_OK(emb_var_->LookupOrCreateKey(*keys_iter_, &value_ptr)); + return emb_var_->flat(value_ptr)(col_idx_++); + } + + private: + int64 keys_idx_; + int64 col_idx_; + typename std::vector::iterator keys_iter_; + std::vector& incr_keys_; + EmbeddingVar* emb_var_; +}; + +template +class IncrEVVersionDumpIterator : public DumpIterator { + public: + IncrEVVersionDumpIterator(std::vector& incr_keys, + EmbeddingVar*& emb_var) + : incr_keys_(incr_keys), emb_var_(emb_var) { + keys_iter_ = incr_keys_.begin(); + } + + bool HasNext() const { return keys_iter_ != incr_keys_.end(); } + + T Next() { + if (emb_var_->StepsToLive() == 0) { + keys_iter_++; + return 0; + } else { + K key = *keys_iter_; + int64 dump_version = emb_var_->GetVersion(key); + keys_iter_++; + return dump_version; + } + } + + private: + std::vector& incr_keys_; + typename std::vector::iterator keys_iter_; + EmbeddingVar* emb_var_; +}; + +template +class IncrEVFreqDumpIterator : public DumpIterator { + public: + IncrEVFreqDumpIterator(std::vector& incr_keys, + EmbeddingVar*& emb_var) + : incr_keys_(incr_keys), emb_var_(emb_var) { + keys_iter_ = incr_keys_.begin(); + } + + bool HasNext() const { return keys_iter_ != incr_keys_.end(); } + + T Next() { + K key = *keys_iter_; + int64 dump_version = emb_var_->GetFreq(key); + keys_iter_++; + return dump_version; + } + + private: + std::vector& incr_keys_; + typename std::vector::iterator keys_iter_; + EmbeddingVar* emb_var_; +}; + +template +class IncrNormalValueDumpIterator : public DumpIterator { + public: + IncrNormalValueDumpIterator(std::vector& incr_keys, const Tensor& variable) + : 
incr_keys_(incr_keys), variable_(variable) { + var_data_ = (T*)variable.flat().data(); + keys_iter_ = incr_keys_.begin(); + keys_idx_ = 1; + col_idx_ = 0; + } + + bool HasNext() const { + if (keys_iter_ != incr_keys_.end()) { + if (keys_idx_ < incr_keys_.size()) { + return true; + } else { + return col_idx_ < variable_.dim_size(1); + } + } else { + return false; + } + } + + T Next() { + if (col_idx_ >= variable_.dim_size(1)) { + keys_iter_++; + keys_idx_++; + col_idx_ = 0; + } + T val = var_data_[(*keys_iter_) * variable_.dim_size(1) + col_idx_]; + col_idx_++; + return val; + } + + private: + std::vector& incr_keys_; + T* var_data_; + int64 col_limit_; + int64 keys_idx_; + typename std::vector::iterator keys_iter_; + int64 col_idx_; + const Tensor& variable_; +}; + +template +class IndicesIncrRecorder : public ResourceBase { + public: + explicit IndicesIncrRecorder(const std::string& name, int32 part_count = 16, + int32 min_part_size = 128) + : name_(name), incr_indices_(min_part_size, part_count) {} + + void UpdateIndices(const Tensor& indices, OpKernelContext* ctx) { + if (global_version_ == -1) { + return; + } + + incr_indices_.Update(indices, ctx); + } + + void UpdateGlobalVersion() { + global_version_ = 1; + mutex_lock l(mu_); + incr_indices_.Clear(); + } + + void SwapIndices(std::unordered_map& indices) { + incr_indices_.Swap(indices); + } + + Status DumpSparseNormalTensor(const string& tensor_name, + const Tensor& variable, BundleWriter* writer) { + mutex_lock l(mu_); + size_t bytes_limit = 8 << 20; + char* dump_buffer = (char*)malloc(sizeof(char) * bytes_limit); + + std::set incr_keys_set; + incr_indices_.GetKeys(incr_keys_set); + std::vector incr_keys; + incr_keys.assign(incr_keys_set.begin(), incr_keys_set.end()); + + IncrKeyDumpIterator key_dump_iter(incr_keys); + Status st = SaveTensorWithFixedBuffer( + tensor_name + "-sparse_incr_keys", writer, dump_buffer, bytes_limit, + &key_dump_iter, TensorShape({incr_keys.size()})); + if (!st.ok()) { + free(dump_buffer); + return st; + } + + IncrNormalValueDumpIterator value_dump_iter(incr_keys, variable); + st = SaveTensorWithFixedBuffer( + tensor_name + "-sparse_incr_values", writer, dump_buffer, bytes_limit, + &value_dump_iter, + TensorShape({incr_keys.size(), variable.dim_size(1)})); + if (!st.ok()) { + free(dump_buffer); + return st; + } + + free(dump_buffer); + return OkStatus(); + } + + Status DumpSparseEmbeddingTensor(const string& tensor_name, + EmbeddingVar* emb_var, + BundleWriter* writer, + OpKernelContext* context) { + mutex_lock l(mu_); + size_t bytes_limit = 8 << 20; + char* dump_buffer = (char*)malloc(sizeof(char) * bytes_limit); + + std::set incr_keys; + incr_indices_.GetKeys(incr_keys); + + std::vector> incr_keys_parts; + incr_keys_parts.resize(kSavedPartitionNum); + + for (auto& ik : incr_keys) { + for (int partid = 0; partid < kSavedPartitionNum; partid++) { + if (ik % kSavedPartitionNum == partid && + emb_var->GetFreq(ik) >= emb_var->MinFreq()) { + incr_keys_parts[partid].push_back(ik); + break; + } + } + } + + std::vector partitioned_incr_keys; + Tensor part_offset_tensor; + context->allocate_temp(DT_INT32, TensorShape({kSavedPartitionNum + 1}), + &part_offset_tensor); + auto part_offset_flat = part_offset_tensor.flat(); + part_offset_flat(0) = 0; + int ptsize = 0; + for (int partid = 0; partid < kSavedPartitionNum; partid++) { + std::vector& key_list = incr_keys_parts[partid]; + + ptsize += key_list.size(); + for (int inpid = 0; inpid < key_list.size(); inpid++) { + partitioned_incr_keys.push_back(key_list[inpid]); 
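+        // Keys are appended partition by partition (partition id = key %
+        // kSavedPartitionNum; low-frequency keys were filtered out above), and
+        // the cumulative per-partition counts recorded below are saved as
+        // "<tensor_name>-incr_partition_offset" next to the incremental keys.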
+ } + + part_offset_flat(partid + 1) = part_offset_flat(partid) + key_list.size(); + } + writer->Add(tensor_name + "-incr_partition_offset", part_offset_tensor); + + IncrKeyDumpIterator key_dump_iter(partitioned_incr_keys); + Status st = SaveTensorWithFixedBuffer( + tensor_name + "-sparse_incr_keys", writer, dump_buffer, bytes_limit, + &key_dump_iter, TensorShape({partitioned_incr_keys.size()})); + if (!st.ok()) { + free(dump_buffer); + return st; + } + + IncrEVValueDumpIterator ev_value_dump_iter(partitioned_incr_keys, + emb_var); + st = SaveTensorWithFixedBuffer( + tensor_name + "-sparse_incr_values", writer, dump_buffer, bytes_limit, + &ev_value_dump_iter, + TensorShape( + {(uint64)partitioned_incr_keys.size(), emb_var->ValueLen()})); + if (!st.ok()) { + free(dump_buffer); + return st; + } + + IncrEVVersionDumpIterator ev_version_dump_iter( + partitioned_incr_keys, emb_var); + st = SaveTensorWithFixedBuffer( + tensor_name + "-sparse_incr_versions", writer, dump_buffer, bytes_limit, + &ev_version_dump_iter, + TensorShape({(uint64)partitioned_incr_keys.size()})); + if (!st.ok()) { + free(dump_buffer); + return st; + } + IncrEVFreqDumpIterator ev_freq_dump_iter(partitioned_incr_keys, + emb_var); + st = SaveTensorWithFixedBuffer( + tensor_name + "-sparse_incr_freqs", writer, dump_buffer, bytes_limit, + &ev_freq_dump_iter, + TensorShape({(uint64)partitioned_incr_keys.size()})); + if (!st.ok()) { + free(dump_buffer); + return st; + } + free(dump_buffer); + return OkStatus(); + } + + string DebugString() const { return "IndicesIncrRecorder"; } + + string GetName() { return name_; } + + private: + mutex mu_; + string name_; + ParallelHashMap incr_indices_; + std::atomic global_version_ = {-1}; + + TF_DISALLOW_COPY_AND_ASSIGN(IndicesIncrRecorder); +}; + +class SparsePartitioner { + public: + SparsePartitioner(int64 part_count, int64_t part_idx, int64 hash_bucket_size) + : part_count_(part_count), + part_idx_(part_idx), + hash_bucket_size_(hash_bucket_size) { + assert(part_idx_ >= part_count_); + } + + virtual int64 CalcGlobalOffset(int64 part_offset) = 0; + + std::string toString() const { + return strings::Printf( + "part_mode:%s, part_count:%lld, part_idx:%ld, hash_bucket_size:%ld", + part_mode_.c_str(), part_count_, part_idx_, (long)hash_bucket_size_); + } + + protected: + std::string part_mode_; + int64 part_count_; + int64 part_idx_; + int64 hash_bucket_size_; +}; + +class DivSparsePartitioner : public SparsePartitioner { + public: + DivSparsePartitioner(int64 part_count, int64 part_idx, int64 hash_bucket_size) + : SparsePartitioner(part_count, part_idx, hash_bucket_size) { + part_mode_ = "div"; + int64 ids_per_part = hash_bucket_size_ / part_count_; + int64 extras = hash_bucket_size_ % part_count_; + + part_offset_start_ = 0; + for (int i = 0; i < part_idx; i++) { + part_offset_start_ += (i < extras ? 
(ids_per_part + 1) : ids_per_part); + } + } + + int64 CalcGlobalOffset(int64 part_offset) { + return part_offset_start_ + part_offset; + } + + private: + int64 part_offset_start_; +}; + +class ModSparsePartitioner : public SparsePartitioner { + public: + ModSparsePartitioner(int64 part_count, int64 part_idx, int64 hash_bucket_size) + : SparsePartitioner(part_count, part_idx, hash_bucket_size) { + part_mode_ = "mod"; + } + + int64 CalcGlobalOffset(int64 part_offset) { + return part_offset * part_count_ + part_idx_; + } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_INCR_SAVE_RESTORE_OPS_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/incr_save_restore/incr_save_restore_ops_test.cc b/deepray/custom_ops/embedding_variable/cc/incr_save_restore/incr_save_restore_ops_test.cc new file mode 100644 index 00000000..fc175dbf --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/incr_save_restore/incr_save_restore_ops_test.cc @@ -0,0 +1,256 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "incr_save_restore_ops.h" + +#include "deepray/custom_ops/embedding_variable/cc/lib/tensor_bundle.h" +#include "deepray/custom_ops/utils/fake_input.h" +#include "deepray/custom_ops/utils/ops_testutil.h" +#include "deepray/custom_ops/utils/tensor_testutil.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/graph/testlib.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/kernels/variable_ops.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/public/session.h" +#include "tensorflow/core/util/tensor_slice_reader_cache.h" + +namespace tensorflow { +namespace { + +void doTestSplitParallelParts( + int part_count, int min_part_size, int total_num, + std::vector> expect_parts) { + ParallelHashMap parallel_hashmap(min_part_size, part_count); + + std::vector> parts; + parallel_hashmap.SplitParallelParts(total_num, part_count, parts); + + ASSERT_EQ(expect_parts.size(), parts.size()); + for (size_t i = 0; i < parts.size(); i++) { + EXPECT_EQ(expect_parts[i].first, parts[i].first); + EXPECT_EQ(expect_parts[i].second, parts[i].second); + } +} + +TEST(ParallelHashMapTest, TestSplitParallelParts) { + 
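+  // Expected splits follow SplitParallelParts with part_count = 4 and
+  // min_part_size = 3: a total of 8 is too small for four parts (8 / 3 = 2
+  // parts of size 4), while 13 keeps four parts of size 13 / 4 = 3 and the
+  // single leftover element goes to the first part: {0,4},{4,7},{7,10},{10,13}.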
doTestSplitParallelParts(4, 3, 0, {}); + doTestSplitParallelParts(4, 3, 1, {{0, 1}}); + doTestSplitParallelParts(4, 3, 8, {{0, 4}, {4, 8}}); + doTestSplitParallelParts(4, 3, 12, {{0, 3}, {3, 6}, {6, 9}, {9, 12}}); + doTestSplitParallelParts(4, 3, 13, {{0, 4}, {4, 7}, {7, 10}, {10, 13}}); + doTestSplitParallelParts(4, 3, 15, {{0, 4}, {4, 8}, {8, 12}, {12, 15}}); + doTestSplitParallelParts(4, 3, 16, {{0, 4}, {4, 8}, {8, 12}, {12, 16}}); + doTestSplitParallelParts(4, 3, 17, {{0, 5}, {5, 9}, {9, 13}, {13, 17}}); +} + +TEST(ParallelHashMapTest, TestUpdateAndSwap) { + ParallelHashMap parallel_hashmap(2); + Tensor t(DT_INT32, TensorShape({5})); + test::FillValues(&t, {1, 2, 3, 2, 3}); + + std::unique_ptr device( + DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); + + OpKernelContext::Params params; + params.device = device.get(); + params.frame_iter = FrameAndIter(0, 0); + std::unique_ptr context(new OpKernelContext(¶ms, 3)); + + parallel_hashmap.Update(t, context.get()); + + std::unordered_map out_indices; + parallel_hashmap.Swap(out_indices); + EXPECT_EQ(3, out_indices.size()); + EXPECT_EQ(1, out_indices[1]); + EXPECT_EQ(2, out_indices[2]); + EXPECT_EQ(2, out_indices[3]); +} + +TEST(ParallelHashMapTest, TestGetKeys) { + ParallelHashMap parallel_hashmap(2); + Tensor t(DT_INT32, TensorShape({5})); + test::FillValues(&t, {1, 2, 3, 2, 3}); + + std::unique_ptr device( + DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); + + OpKernelContext::Params params; + params.device = device.get(); + params.frame_iter = FrameAndIter(0, 0); + std::unique_ptr context(new OpKernelContext(¶ms, 3)); + + parallel_hashmap.Update(t, context.get()); + + std::set keys; + parallel_hashmap.GetKeys(keys); + EXPECT_EQ(3, keys.size()); + EXPECT_TRUE(keys.find(1) != keys.end()); + EXPECT_TRUE(keys.find(2) != keys.end()); + EXPECT_TRUE(keys.find(3) != keys.end()); +} + +TEST(IndicesIncrRecorderTest, TestUpdateAndSwap) { + Tensor t(DT_INT32, TensorShape({5})); + test::FillValues(&t, {1, 2, 3, 2, 3}); + + std::unique_ptr device( + DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); + + OpKernelContext::Params params; + params.device = device.get(); + params.frame_iter = FrameAndIter(0, 0); + std::unique_ptr context(new OpKernelContext(¶ms, 3)); + + IndicesIncrRecorder recorder("test", 16, 2); + recorder.UpdateGlobalVersion(); + recorder.UpdateIndices(t, context.get()); + + std::unordered_map out_indices; + recorder.SwapIndices(out_indices); + EXPECT_EQ(3, out_indices.size()); + EXPECT_EQ(1, out_indices[1]); + EXPECT_EQ(2, out_indices[2]); + EXPECT_EQ(2, out_indices[3]); +} + +TEST(DivSparsePartitionerTest, TestCalcGlobalOffset) { + // part_count: 4, hash_bucket_size: 15 + // [0, 4), [4, 8), [8, 12), [12, 15) + + { + DivSparsePartitioner p(4, 0, 15); + EXPECT_EQ(0, p.CalcGlobalOffset(0)); + EXPECT_EQ(1, p.CalcGlobalOffset(1)); + EXPECT_EQ(2, p.CalcGlobalOffset(2)); + EXPECT_EQ(3, p.CalcGlobalOffset(3)); + } + + { + DivSparsePartitioner p(4, 1, 15); + EXPECT_EQ(4, p.CalcGlobalOffset(0)); + EXPECT_EQ(5, p.CalcGlobalOffset(1)); + EXPECT_EQ(6, p.CalcGlobalOffset(2)); + EXPECT_EQ(7, p.CalcGlobalOffset(3)); + } + + { + DivSparsePartitioner p(4, 2, 15); + EXPECT_EQ(8, p.CalcGlobalOffset(0)); + EXPECT_EQ(9, p.CalcGlobalOffset(1)); + EXPECT_EQ(10, p.CalcGlobalOffset(2)); + EXPECT_EQ(11, p.CalcGlobalOffset(3)); + } + + { + DivSparsePartitioner p(4, 3, 15); + EXPECT_EQ(12, p.CalcGlobalOffset(0)); + EXPECT_EQ(13, p.CalcGlobalOffset(1)); + EXPECT_EQ(14, p.CalcGlobalOffset(2)); + } +} + +class 
CollectOpTest : public OpsTestBase { + protected: + void MakeOp(const string &config_str, const string &tensor_name, + DataType ktype, const string &part_mode = "div", + int64 part_idx = 0, int64 part_count = 0, + int64 hash_bucket_size = 0) { + TF_EXPECT_OK(NodeDefBuilder("collect_op", "CollectSparseIndices") + .Attr("tensor_name", tensor_name) + .Attr("config", config_str) + .Attr("part_idx", part_idx) + .Attr("part_count", part_count) + .Attr("hash_bucket_size", hash_bucket_size) + .Attr("part_mode", part_mode) + .Attr("ktype", ktype) + .Finalize(node_def())); + + TF_EXPECT_OK(InitOp()); + } + + template + void CheckCollect() { + string tensor_name = "test_tensor_name"; + DataType key_type = DataTypeToEnum::v(); + MakeOp("", tensor_name, key_type); + + // prepare context to run the op + context_.reset(nullptr); + + params_.reset(new OpKernelContext::Params); + params_.get()->device = device_; + params_.get()->frame_iter = FrameAndIter(0, 0); + params_.get()->inputs = &inputs_; + params_.get()->op_kernel = kernel_.get(); + step_container_.reset(new ScopedStepContainer(0, [](const string &) {})); + params_->step_container = step_container_.get(); + std::vector attrs; + test::SetOutputAttrs(params_.get(), &attrs); + checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_wrapper; + params_.get()->slice_reader_cache = &slice_reader_cache_wrapper; + params_.get()->resource_manager = device_->resource_manager(); + + context_.reset(new OpKernelContext(params_.get())); + + IndicesIncrRecorder *sparse_incr_res = nullptr; + auto rm = device_->resource_manager(); + + Status s = rm->LookupOrCreate>( + "", tensor_name + "_sparse_incr", &sparse_incr_res, + [this, tensor_name](IndicesIncrRecorder **ptr) { + *ptr = new IndicesIncrRecorder(tensor_name); + (*ptr)->UpdateGlobalVersion(); + return OkStatus(); + }); + ASSERT_TRUE(s.ok()); + + Tensor indices(allocator(), key_type, TensorShape({5})); + test::FillValues( + &indices, {(KeyType)1, (KeyType)2, (KeyType)3, (KeyType)4, (KeyType)5}); + sparse_incr_res->UpdateIndices(indices, context_.get()); + + device_->Compute(kernel_.get(), context_.get()); + + Tensor output_keys = *GetOutput(0); + Tensor output_global_keys = *GetOutput(1); + EXPECT_EQ(5, output_keys.NumElements()); + EXPECT_EQ(5, output_global_keys.NumElements()); + test::ExpectTensorEqual(output_keys, output_global_keys); + } +}; + +#define TEST_COLLECT(kt) \ + TEST_F(CollectOpTest, TestCollect##_##kt) { CheckCollect(); } + +TEST_COLLECT(int64); +TEST_COLLECT(int32); + +} // namespace +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/embedding_collection.cc b/deepray/custom_ops/embedding_variable/cc/kernels/embedding_collection.cc new file mode 100644 index 00000000..66aa6d68 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/embedding_collection.cc @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include + +#include "hotness_calculate.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/resource_var.h" + +namespace stream_executor { +namespace gpu { +cudaStream_t AsGpuStreamValue(Stream* stream); +} // namespace gpu +} // namespace stream_executor + +namespace tensorflow { + +// ----------------------------------------------------------------------------------------------- +// HotnessCalculate +// ----------------------------------------------------------------------------------------------- +template +class HotnessCalculateOp : public OpKernel { + public: + explicit HotnessCalculateOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + launcher_.initialize(); + OP_REQUIRES_OK(ctx, ctx->GetAttr("num_lookups", &num_lookups_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("num_gpus", &num_gpus_)); + } + + void Compute(OpKernelContext* ctx) override { + const Tensor* row_length_send_buffer = nullptr; + OP_REQUIRES_OK(ctx, + ctx->input("row_length_buffer", &row_length_send_buffer)); + int64_t input_len = row_length_send_buffer->dim_size(0); + OP_REQUIRES( + ctx, input_len % (num_lookups_ * num_gpus_) == 0, + errors::InvalidArgument("input_len%(num_lookups_*num_gpus_) != 0")); + size_t local_batchsize = input_len / num_lookups_ / num_gpus_; + Tensor* hotness = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, {num_lookups_}, &hotness)); + + // temp buffer + Tensor device_buffer; + OP_REQUIRES_OK( + ctx, ctx->allocate_temp(DT_INT32, {num_lookups_}, &device_buffer)); + + // stream + auto device_ctx = ctx->op_device_context(); + OP_REQUIRES(ctx, device_ctx != nullptr, + errors::Aborted("No valid device context.")); + cudaStream_t stream = + stream_executor::gpu::AsGpuStreamValue(device_ctx->stream()); + + // cuda kernel + launcher_(row_length_send_buffer->data(), local_batchsize, num_lookups_, + num_gpus_, device_buffer.data(), hotness->data(), stream); + } + + private: + sok::HotnessCalLauncher launcher_; + int num_lookups_; + int num_gpus_; +}; + +#define REGISTER_GPU_KERNELS(dtype_tf, dtype) \ + REGISTER_KERNEL_BUILDER(Name("HotnessCalculate") \ + .Device(DEVICE_GPU) \ + .HostMemory("hotness") \ + .TypeConstraint("Tindices"), \ + HotnessCalculateOp) + +REGISTER_GPU_KERNELS(int64_t, int64_t); +REGISTER_GPU_KERNELS(int32_t, int32_t); + +#undef REGISTER_GPU_KERNELS + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/hotness_calculate.cu.cc b/deepray/custom_ops/embedding_variable/cc/kernels/hotness_calculate.cu.cc new file mode 100644 index 00000000..0222bdcf --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/hotness_calculate.cu.cc @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "hotness_calculate.h" + +#include "deepray/custom_ops/utils/check.h" + +namespace sok { + +template +__global__ void hotnessCalKernel(const DType *row_length_recv_buffer, + size_t local_batchsize, int num_lookup, + int num_gpus, int *outputs) { + size_t thread_cnt = blockDim.x * gridDim.x; + size_t thread_idx = blockDim.x * blockIdx.x + threadIdx.x; + size_t items = local_batchsize * num_lookup * num_gpus; + extern __shared__ int smem[]; + for (size_t i = threadIdx.x; i < num_lookup; i += blockDim.x) { + smem[i] = 0; + } + + __syncthreads(); + for (size_t i = thread_idx; i < items; i += thread_cnt) { + size_t num_lookup_id = (i / local_batchsize) % num_lookup; + int value = (int)(row_length_recv_buffer[i]); + atomicMax(smem + num_lookup_id, value); + } + + __syncthreads(); + for (size_t i = threadIdx.x; i < num_lookup; i += blockDim.x) { + atomicMax(outputs + i, smem[i]); + } +} + +template +void HotnessCalLauncher::initialize() { + int device; + CUDACHECK(cudaGetDevice(&device)); + CUDACHECK(cudaDeviceGetAttribute(&sm_count_, cudaDevAttrMultiProcessorCount, + device)); +} + +template +void HotnessCalLauncher::operator()( + const void *row_length_recv_buffer, size_t local_batchsize, int num_lookup, + int num_gpus, void *output_device, void *output_host, cudaStream_t stream) { + const DType *t_row_length_recv_buffer = + reinterpret_cast(row_length_recv_buffer); + int32_t *t_output_device = reinterpret_cast(output_device); + int32_t *t_output_host = reinterpret_cast(output_host); + + dim3 grid_dim(2 * sm_count_); + dim3 block_dim(1024ul); + CUDACHECK(cudaMemsetAsync(t_output_device, 0, sizeof(int32_t) * num_lookup, + stream)); + hotnessCalKernel + <<>>( + t_row_length_recv_buffer, local_batchsize, num_lookup, num_gpus, + t_output_device); + CUDACHECK(cudaMemcpyAsync(t_output_host, t_output_device, + sizeof(int32_t) * num_lookup, + cudaMemcpyDeviceToHost, stream)); + CUDACHECK(cudaStreamSynchronize(stream)); + + // CUDACHECK(cudaGetLastError()); +} + +template class HotnessCalLauncher; +template class HotnessCalLauncher; + +} // namespace sok diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/hotness_calculate.h b/deepray/custom_ops/embedding_variable/cc/kernels/hotness_calculate.h new file mode 100644 index 00000000..8b1d473a --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/hotness_calculate.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#ifndef HOTNESS_KERNEL_H
+#define HOTNESS_KERNEL_H
+
+#include
+#include
+
+#include
+
+namespace sok {
+
+template <typename DType>
+class HotnessCalLauncher {
+ public:
+  void initialize();
+  void operator()(const void* row_length_recv_buffer, size_t local_batchsize,
+                  int num_lookup, int num_gpus, void* output_device,
+                  void* output_host, cudaStream_t stream = 0);
+
+ private:
+  int sm_count_;
+};
+
+}  // namespace sok
+
+#endif
diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_lookup_ops.cc b/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_lookup_ops.cc
new file mode 100644
index 00000000..349b941b
--- /dev/null
+++ b/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_lookup_ops.cc
@@ -0,0 +1,593 @@
+/* Copyright 2022 The DeepRec Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+=======================================================================*/
+
+#define EIGEN_USE_THREADS
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+#endif
+
+#include "deepray/custom_ops/embedding_variable/cc/embedding/cache.h"
+#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h"
+#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var_context.h"
+#include "deepray/custom_ops/embedding_variable/config.pb.h"
+#include "kv_variable_util.h"
+#include "tensorflow/core/framework/bounds_check.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/dense_update_functor.h"
+#include "tensorflow/core/kernels/gather_functor.h"
+#include "tensorflow/core/kernels/scatter_functor.h"
+#include "tensorflow/core/kernels/training_op_helpers.h"
+#include "tensorflow/core/kernels/variable_ops.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/mem.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/env_var.h"
+#include "tensorflow/core/util/util.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+#if GOOGLE_CUDA
+#include "tensorflow/core/kernels/gpu_device_array.h"
+#include "tensorflow/core/platform/stream_executor.h"
+#endif  // GOOGLE_CUDA
+
+namespace tensorflow {
+using CPUDevice = Eigen::ThreadPoolDevice;
+using GPUDevice = Eigen::GpuDevice;
+
+#if GOOGLE_CUDA
+using se::DeviceMemoryBase;
+using se::Stream;
+#endif  // GOOGLE_CUDA
+
+template <typename TKey, typename TValue>
+class KvResourceLookupResourceOp : public OpKernel {
+ public:
+  explicit KvResourceLookupResourceOp(OpKernelConstruction* c) : OpKernel(c) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    EmbeddingVar<TKey, TValue>* ev = nullptr;
+    OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &ev));
+    Tensor* output;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, {1}, &output));
+    auto output_scalar = output->scalar<int64>();
+    output_scalar() = (int64)ev;
+  }
+};
+
+#define REGISTER_KV_LOOKUP_RESOURCE(dev, ktype,
vtype) \ + REGISTER_KERNEL_BUILDER(Name("KvResourceLookupResource") \ + .Device(DEVICE_##dev) \ + .HostMemory("output") \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + KvResourceLookupResourceOp); +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KV_LOOKUP_RESOURCE(dev, int32, type) \ + REGISTER_KV_LOOKUP_RESOURCE(dev, int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(CPU, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU + +#if GOOGLE_CUDA +#define REGISTER_KERNELS_GPU(type) REGISTER_KERNELS_ALL(GPU, type) +TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU) +#undef REGISTER_KERNELS_GPU +#endif // GOOGLE_CUDA + +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KV_LOOKUP_RESOURCE + +template +class KvResourceLookupIDOp : public OpKernel { + public: + explicit KvResourceLookupIDOp(OpKernelConstruction* c) : OpKernel(c) {} + + void Compute(OpKernelContext* c) override { + EmbeddingVar* ev = nullptr; + OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &ev)); + core::ScopedUnref unref_me(ev); + const Tensor& indices = c->input(1); + const int64 N = indices.NumElements(); + + TensorShape result_shape = indices.shape(); + + Tensor* out = nullptr; + OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &out)); + + if (N > 0) { + auto out_flat = out->flat(); + int64* out_base = &out_flat(0); + + auto indices_flat = indices.flat(); + const int64 indices_size = static_cast(indices_flat.dimension(0)); + EmbeddingVarContext ev_ctx(c); + ev->GetOrCreateKey(ev_ctx, indices, reinterpret_cast(out_base), + indices_size); + } + } +}; + +#define REGISTER_KERNELS(dev, ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceLookupID") \ + .Device(DEVICE_##dev) \ + .TypeConstraint("dtype") \ + .TypeConstraint("Tkeys"), \ + KvResourceLookupIDOp) +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KERNELS(dev, int32, type); \ + REGISTER_KERNELS(dev, int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(CPU, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU + +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +#if GOOGLE_CUDA +#define REGISTER_KERNELS(dev, ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceLookupID") \ + .Device(DEVICE_##dev) \ + .HostMemory("indices") \ + .HostMemory("pointer") \ + .TypeConstraint("dtype") \ + .TypeConstraint("Tkeys"), \ + KvResourceLookupIDOp) +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KERNELS(dev, int32, type); \ + REGISTER_KERNELS(dev, int64, type) +#define REGISTER_KERNELS_GPU(type) REGISTER_KERNELS_ALL(GPU, type) +TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU) +#undef REGISTER_KERNELS_GPU + +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS +#endif // GOOGLE_CUDA + +template +class KvResourceCollectEmbeddingOp : public OpKernel { + public: + explicit KvResourceCollectEmbeddingOp(OpKernelConstruction* c) + : OpKernel(c) {} + + void Compute(OpKernelContext* c) override { + EmbeddingVar* ev = nullptr; + OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &ev)); + core::ScopedUnref unref_me(ev); + const Tensor& indices = c->input(1); + const Tensor& pointer = c->input(2); + const int64 N = indices.NumElements(); + + TensorShape result_shape = indices.shape(); + TensorShape value_shape({ev->ValueLen()}); + result_shape.AppendShape(value_shape); + + Tensor* out = nullptr; + OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &out)); + + if (N > 0) { + auto out_flat = out->shaped({N, out->NumElements() / N}); + TValue* 
out_base = &out_flat(0, 0); + + auto indices_flat = indices.flat(); + auto pointer_flat = pointer.flat(); + const int64 indices_size = static_cast(indices_flat.dimension(0)); + const int64 slice_elems = out_flat.dimension(1); + OP_REQUIRES( + c, ev->ValueLen() == slice_elems, + errors::InvalidArgument( + "ev's value_len should same with output's dimension(1)", + std::to_string(slice_elems), std::to_string(ev->ValueLen()))); + OP_REQUIRES( + c, + !ev->IsMultiLevel() || (ev->IsMultiLevel() && ev->CacheSize() >= N), + errors::InvalidArgument("MultiLevel EV's Cache size ", + ev->CacheSize(), + " should large than IDs in batch ", N)); + const size_t slice_bytes = slice_elems * sizeof(TValue); + EmbeddingVarContext ev_ctx(c); + ev->GatherEmbeddings(ev_ctx, indices, (void**)pointer.data(), out_base, + N); + } + } +}; + +#define REGISTER_KERNELS(dev, ktype, vtype) \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceCollectEmbedding") \ + .Device(DEVICE_##dev) \ + .HostMemory("resource") \ + .HostMemory("indices") \ + .HostMemory("pointer") \ + .HostMemory("default_value") \ + .HostMemory("output") \ + .TypeConstraint("dtype") \ + .TypeConstraint("Tkeys"), \ + KvResourceCollectEmbeddingOp) + +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KERNELS(dev, int32, type); \ + REGISTER_KERNELS(dev, int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(CPU, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +#if GOOGLE_CUDA +#define REGISTER_KERNELS(dev, ktype, vtype) \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceCollectEmbedding") \ + .Device(DEVICE_##dev) \ + .HostMemory("indices") \ + .HostMemory("pointer") \ + .HostMemory("default_value") \ + .TypeConstraint("dtype") \ + .TypeConstraint("Tkeys"), \ + KvResourceCollectEmbeddingOp) + +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KERNELS(dev, int32, type); \ + REGISTER_KERNELS(dev, int64, type) +#define REGISTER_KERNELS_GPU(type) REGISTER_KERNELS_ALL(GPU, type) +TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU) +#undef REGISTER_KERNELS_GPU +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS +#endif // GOOGLE_CUDA + +template +class KvResourceGatherOp : public OpKernel { + public: + explicit KvResourceGatherOp(OpKernelConstruction* c) : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("is_use_default_value_tensor", + &is_use_default_value_tensor_)); + } + + void Compute(OpKernelContext* c) override { + EmbeddingVar* ev = nullptr; + OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &ev)); + core::ScopedUnref unref_me(ev); + const Tensor& indices = c->input(1); + const int64 N = indices.NumElements(); + + TensorShape result_shape = indices.shape(); + TensorShape value_shape({ev->ValueLen()}); + result_shape.AppendShape(value_shape); + + Tensor* out = nullptr; + OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &out)); + + if (N > 0) { + auto out_flat = out->shaped({N, out->NumElements() / N}); + TValue* out_base = &out_flat(0, 0); + + const int64 slice_elems = out_flat.dimension(1); + OP_REQUIRES( + c, ev->ValueLen() == slice_elems, + errors::InvalidArgument( + "ev's value_len should same with output's dimension(1)", + std::to_string(slice_elems), std::to_string(ev->ValueLen()))); + OP_REQUIRES( + c, + !ev->IsMultiLevel() || (ev->IsMultiLevel() && ev->CacheSize() >= N), + errors::InvalidArgument("MultiLevel EV's Cache size ", + ev->CacheSize(), + " should large than IDs in batch ", N)); + + EmbeddingVarContext ev_ctx(c); + if 
(is_use_default_value_tensor_) { + ev->GetEmbeddings(ev_ctx, (TKey*)indices.data(), out_base, N, + reinterpret_cast(c->input(2).data())); + } else { + ev->GetEmbeddings(ev_ctx, (TKey*)indices.data(), out_base, N); + if (has_counts) { + const Tensor& indices_counts = c->input(2); + ev->UpdateCache(indices, indices_counts, true); + } else { + ev->UpdateCache(indices, true); + } + } + } + } + + private: + bool is_use_default_value_tensor_; +}; + +#define REGISTER_KERNELS(dev, ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("KvResourceGather") \ + .Device(DEVICE_##dev) \ + .TypeConstraint("dtype") \ + .TypeConstraint("Tkeys"), \ + KvResourceGatherOp) + +#define REGISTER_KERNELS_ALL_INDICES(type) \ + REGISTER_KERNELS(CPU, int32, type); \ + REGISTER_KERNELS(CPU, int64, type) + +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDICES) +#undef REGISTER_KERNELS_ALL_INDICES +#undef REGISTER_KERNELS + +#define REGISTER_KERNELS(dev, ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("KvResourceGatherV1") \ + .Device(DEVICE_##dev) \ + .TypeConstraint("dtype") \ + .TypeConstraint("Tkeys"), \ + KvResourceGatherOp) + +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KERNELS(dev, int32, type); \ + REGISTER_KERNELS(dev, int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(CPU, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +#if GOOGLE_CUDA +template +class KvResourceGatherGPUOp : public OpKernel { + public: + explicit KvResourceGatherGPUOp(OpKernelConstruction* c) : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("is_use_default_value_tensor", + &is_use_default_value_tensor_)); + } + + void Compute(OpKernelContext* c) override { + EmbeddingVar* ev = nullptr; + OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &ev)); + core::ScopedUnref unref_me(ev); + const Tensor& indices = c->input(1); + const int64 N = indices.NumElements(); + + TensorShape result_shape = indices.shape(); + TensorShape value_shape({ev->ValueLen()}); + result_shape.AppendShape(value_shape); + + Tensor* out = nullptr; + OP_REQUIRES_OK(c, c->allocate_output(0, result_shape, &out)); + + if (N > 0) { + auto out_flat = out->shaped({N, out->NumElements() / N}); + TValue* out_base = &out_flat(0, 0); + + auto indices_flat = indices.flat(); + const int64 indices_size = static_cast(indices_flat.dimension(0)); + const int64 slice_elems = out_flat.dimension(1); + TValue* default_v = nullptr; + if (is_use_default_value_tensor_) { + default_v = (TValue*)c->input(2).data(); + } else { + default_v = ev->GetDefaultValuePtr(); + } + OP_REQUIRES( + c, ev->ValueLen() == slice_elems, + errors::InvalidArgument( + "ev's value_len should same with output's dimension(1)", + std::to_string(slice_elems), std::to_string(ev->ValueLen()))); + OP_REQUIRES( + c, + !ev->IsMultiLevel() || (ev->IsMultiLevel() && ev->CacheSize() >= N), + errors::InvalidArgument("MultiLevel EV's Cache size ", + ev->CacheSize(), + " should large than IDs in batch ", N)); + const size_t slice_bytes = slice_elems * sizeof(TValue); + EmbeddingVarContext ev_ctx(c); + if (ev->IsSingleHbm()) { + const TKey* key_base = &indices_flat(0); + const Device& device = c->eigen_device(); + if (is_use_default_value_tensor_) { + Tensor default_values(c->input(2)); + auto default_value_num = + default_values.NumElements() / ev->ValueLen(); + auto default_values_matrix = default_values.shaped( + {default_value_num, ev->ValueLen()}); + TValue* default_v_base = &default_values_matrix(0, 0); + 
ev->GetEmbeddings(ev_ctx, key_base, out_base, N); + } else { + ev->GetEmbeddings(ev_ctx, key_base, out_base, N); + } + } else { + Tensor indices_host(indices.dtype(), indices.shape()); + // Copy ids from GPU to CPU for CPU Lookup. + auto stream = c->op_device_context()->stream(); + auto event_mgr = + c->device()->tensorflow_accelerator_device_info()->event_mgr; + se::DeviceMemoryBase gpu_src(const_cast(&indices_flat(0)), + N * sizeof(TKey)); + stream->ThenMemcpy(indices_host.data(), gpu_src, N * sizeof(TKey)); + SyncWithEventMgr(stream, event_mgr); + + EmbeddingVarContext ev_ctx(c); + ev->GetEmbeddings(ev_ctx, (TKey*)indices_host.data(), out_base, N); + if (has_counts) { + const Tensor& indices_counts = c->input(2); + ev->UpdateCache(indices_host, indices_counts, true); + } else { + ev->UpdateCache(indices_host, true); + } + } + } + } + + private: + bool is_use_default_value_tensor_; +}; + +#define REGISTER_KERNELS(dev, ktype, vtype) \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceGather") \ + .Device(DEVICE_##dev) \ + .TypeConstraint("dtype") \ + .TypeConstraint("Tkeys"), \ + KvResourceGatherGPUOp) + +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KERNELS(dev, int32, type); \ + REGISTER_KERNELS(dev, int64, type) +#define REGISTER_KERNELS_GPU(type) REGISTER_KERNELS_ALL(GPU, type) +TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU); +#undef REGISTER_KERNELS_GPU +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +#define REGISTER_KERNELS(dev, ktype, vtype) \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceGatherV1") \ + .Device(DEVICE_##dev) \ + .HostMemory("counts") \ + .TypeConstraint("dtype") \ + .TypeConstraint("Tkeys"), \ + KvResourceGatherGPUOp) + +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KERNELS(dev, int32, type); \ + REGISTER_KERNELS(dev, int64, type) +#define REGISTER_KERNELS_GPU(type) REGISTER_KERNELS_ALL(GPU, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_GPU) +#undef REGISTER_KERNELS_GPU +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS +#endif // GOOGLE_CUDA + +template +class EVGetFrequencyOp : public OpKernel { + public: + explicit EVGetFrequencyOp(OpKernelConstruction* c) : OpKernel(c) {} + + void Compute(OpKernelContext* ctx) override { + EmbeddingVar* ev = nullptr; + OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &ev)); + core::ScopedUnref unref_me(ev); + const Tensor& indices = ctx->input(1); + auto indices_flat = indices.flat(); + + Tensor* output; + OP_REQUIRES_OK(ctx, + ctx->allocate_output(0, {indices.NumElements()}, &output)); + for (int i = 0; i < indices.NumElements(); ++i) { + int64 f = ev->GetFreq(indices_flat(i)); + output->flat()(i) = f; + } + } +}; + +#define REGISTER_KERNELS(ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("EVGetFrequency") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("Tvalues"), \ + EVGetFrequencyOp); +#define REGISTER_KERNELS_ALL(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL) +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +template +class EVGetVersionOp : public OpKernel { + public: + explicit EVGetVersionOp(OpKernelConstruction* c) : OpKernel(c) {} + + void Compute(OpKernelContext* ctx) override { + EmbeddingVar* ev = nullptr; + OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &ev)); + core::ScopedUnref unref_me(ev); + const Tensor& indices = ctx->input(1); + auto indices_flat = indices.flat(); + + Tensor* output; + OP_REQUIRES_OK(ctx, + ctx->allocate_output(0, 
{indices.NumElements()}, &output)); + for (int i = 0; i < indices.NumElements(); ++i) { + int64 v = ev->GetVersion(indices_flat(i)); + output->flat()(i) = v; + } + } +}; + +#define REGISTER_KERNELS(ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("EVGetVersion") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("Tvalues"), \ + EVGetVersionOp); +#define REGISTER_KERNELS_ALL(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL) +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +template +class KvResourceLookupTierOp : public OpKernel { + public: + explicit KvResourceLookupTierOp(OpKernelConstruction* c) : OpKernel(c) {} + + void Compute(OpKernelContext* ctx) override { + EmbeddingVar* ev = nullptr; + OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &ev)); + core::ScopedUnref unref_me(ev); + const Tensor& indices = ctx->input(1); + auto indices_flat = indices.flat(); + + Tensor* output; + OP_REQUIRES_OK(ctx, + ctx->allocate_output(0, {indices.NumElements()}, &output)); + for (int i = 0; i < indices.NumElements(); ++i) { + int v = ev->storage()->LookupTier(indices_flat(i)); + output->flat()(i) = v; + } + } +}; + +#define REGISTER_KERNELS(ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("KvResourceLookupTier") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + KvResourceLookupTierOp); +#define REGISTER_KERNELS_ALL(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL) +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +#if GOOGLE_CUDA +#define REGISTER_KERNELS(ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("KvResourceLookupTier") \ + .Device(DEVICE_GPU) \ + .HostMemory("ids") \ + .HostMemory("output") \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + KvResourceLookupTierOp); +#define REGISTER_KERNELS_ALL(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_ALL) +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_ops.cc b/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_ops.cc new file mode 100644 index 00000000..922c2122 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_ops.cc @@ -0,0 +1,620 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "training_ali_ops_gpu.h" +#endif + +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/dense_update_functor.h" +#include "tensorflow/core/kernels/gather_functor.h" +#include "tensorflow/core/kernels/scatter_functor.h" +#include "tensorflow/core/kernels/training_op_helpers.h" +#include "tensorflow/core/kernels/variable_ops.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/mem.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/util/util.h" +#include "tensorflow/core/util/work_sharder.h" + +// Please use the appropriate namespace for your project +namespace tensorflow { + +using ::tensorflow::OpKernel; +using ::tensorflow::OpKernelConstruction; +using ::tensorflow::OpKernelContext; +using ::tensorflow::Tensor; +using ::tensorflow::errors::InvalidArgument; + +// ----------------------------------------------------------------------------------------------- +// KvVarHandle +// ----------------------------------------------------------------------------------------------- +template +class KvVarHandleOp : public OpKernel { + public: + explicit KvVarHandleOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("container", &container_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &name_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("Tkeys", &key_type_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_and_shape_.dtype)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("shape", &dtype_and_shape_.shape)); + OP_REQUIRES(ctx, dtype_and_shape_.shape.dims() == 1, + errors::Aborted("len(shape) must be 1")); + OP_REQUIRES(ctx, dtype_and_shape_.shape.dim_size(0) > 0, + errors::Aborted("shape[0] must > 0")); + + info_ = Info(); + is_anonymous_ = name_ == ResourceHandle::ANONYMOUS_NAME; + + // Use const_tensor_ if the variable is non-anonymous. + if (!is_anonymous_) { + AllocatorAttributes attr; + attr.set_on_host(true); + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_RESOURCE, TensorShape({}), + &const_tensor_, attr)); + const_tensor_.scalar()() = + MakeResourceHandle>( + ctx, container_, name_, + std::vector{dtype_and_shape_}); + std::cout << "[EV INFO] Create non-anonymous " + info_ << std::endl; + } + } + + void Compute(OpKernelContext* ctx) override { + if (is_anonymous_) { + // throw std::invalid_argument("EV cannot be ANONYMOUS!"); + OP_REQUIRES(ctx, false, + errors::InvalidArgument("EV cannot be ANONYMOUS!")); + } else { + ctx->set_output(0, const_tensor_); + } + } + + const Tensor* const_tensor() const override { + return is_anonymous_ ? 
nullptr : &const_tensor_; + } + + private: + bool is_anonymous_; + std::string container_; + std::string name_; + std::string info_; + DataType key_type_; + DtypeAndPartialTensorShape dtype_and_shape_; + Tensor const_tensor_; + + std::string Info() { + std::string dtype = DataTypeString(dtype_and_shape_.dtype); + std::string key_type = DataTypeString(key_type_); + std::string dim_0 = std::to_string(dtype_and_shape_.shape.dim_size(0)); + std::string shape = "[" + dim_0 + "]"; + std::string info = + " handle: " + container_ + "/" + name_ + ", "; + info += "key_type: " + key_type + ", dtype: " + dtype + ", shape: " + shape; + return info; + } +}; + +#define REGISTER_KV_VAR_HANDLE(dev, ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("KvVarHandleOp") \ + .Device(DEVICE_##dev) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + KvVarHandleOp); +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KV_VAR_HANDLE(dev, int32, type) \ + REGISTER_KV_VAR_HANDLE(dev, int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(CPU, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU + +#if GOOGLE_CUDA +#define REGISTER_KERNELS_GPU(type) REGISTER_KERNELS_ALL(GPU, type) +TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU) +#undef REGISTER_KERNELS_GPU +#endif // GOOGLE_CUDA + +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KV_VAR_HANDLE + +template +class KvVariableShapeOp : public OpKernel { + public: + explicit KvVariableShapeOp(OpKernelConstruction* c) : OpKernel(c) {} + + void Compute(OpKernelContext* ctx) override { + EmbeddingVar* ev = nullptr; + OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &ev)); + core::ScopedUnref unref_me(ev); + TensorShape shape({ev->Size(), ev->ValueLen()}); + Tensor* output; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, {shape.dims()}, &output)); + for (int i = 0; i < shape.dims(); ++i) { + output->flat()(i) = shape.dim_size(i); + } + } +}; + +#define REGISTER_KERNELS(dev, type, ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("KvVariableShape") \ + .Device(DEVICE_##dev) \ + .TypeConstraint("out_type") \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype") \ + .HostMemory("output"), \ + KvVariableShapeOp); +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KERNELS(dev, int32, int32, type) \ + REGISTER_KERNELS(dev, int32, int64, type) \ + REGISTER_KERNELS(dev, int64, int32, type) \ + REGISTER_KERNELS(dev, int64, int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(CPU, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU + +#if GOOGLE_CUDA +#define REGISTER_KERNELS_GPU(type) REGISTER_KERNELS_ALL(GPU, type) +TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU) +#undef REGISTER_KERNELS_GPU +#endif // GOOGLE_CUDA + +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +class DestroyKvResourceOp : public OpKernel { + public: + explicit DestroyKvResourceOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, + ctx->GetAttr("ignore_lookup_error", &ignore_lookup_error_)); + } + + void Compute(OpKernelContext* ctx) override { + const ResourceHandle& p = HandleFromInput(ctx, 0); + Status status = DeleteResource(ctx, p); + if (ignore_lookup_error_ && errors::IsNotFound(status)) { + return; + } + OP_REQUIRES_OK(ctx, status); + } + + private: + bool ignore_lookup_error_; +}; + +REGISTER_KERNEL_BUILDER(Name("DestroyKvResourceOp").Device(DEVICE_CPU), + DestroyKvResourceOp); + +template +class InitializeKvVariableOp : public OpKernel { + public: + explicit 
InitializeKvVariableOp(OpKernelConstruction* c) : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("dtype", &dtype_)); + OP_REQUIRES_OK(c, c->GetAttr("counter_type", &counter_type_)); + OP_REQUIRES_OK(c, c->GetAttr("shape", &shape_)); + OP_REQUIRES(c, shape_.dims() == 1, + errors::InvalidArgument("KvVariable dimension must be 1")); + OP_REQUIRES_OK(c, c->GetAttr("emb_index", &emb_index_)); + OP_REQUIRES_OK(c, c->GetAttr("block_num", &block_num_)); + OP_REQUIRES_OK(c, c->GetAttr("slot_index", &slot_index_)); + OP_REQUIRES_OK(c, c->GetAttr("steps_to_live", &steps_to_live_)); + OP_REQUIRES_OK(c, c->GetAttr("filter_freq", &filter_freq_)); + OP_REQUIRES_OK(c, c->GetAttr("max_freq", &max_freq_)); + OP_REQUIRES_OK(c, c->GetAttr("max_element_size", &max_element_size_)); + OP_REQUIRES_OK(c, c->GetAttr("false_positive_probability", + &false_positive_probability_)); + OP_REQUIRES_OK(c, c->GetAttr("l2_weight_threshold", &l2_weight_threshold_)); + OP_REQUIRES_OK(c, c->GetAttr("default_value_dim", &default_value_dim_)); + OP_REQUIRES_OK(c, c->GetAttr("default_value_no_permission", + &default_value_no_permission_)); + OP_REQUIRES_OK(c, c->GetAttr("slot_num", &slot_num_)); + OP_REQUIRES_OK(c, c->GetAttr("record_freq", &record_freq_)); + OP_REQUIRES_OK(c, c->GetAttr("record_version", &record_version_)); + int embedding_var_type = 0; + Status s = c->GetAttr("embedding_variable_type", &embedding_var_type); + if (!s.ok()) { + // Not InitializeKvVariableV2Op! + embedding_var_type = embedding::EmbeddingVariableType::MUTABLE; + } + is_inference_ = false; + TF_CHECK_OK(ReadBoolFromEnvVar(kInferenceMode, false, &is_inference_)); + is_inference_ |= + (embedding_var_type == embedding::EmbeddingVariableType::IMMUTABLE); + + // initial_num_buckets is useless, so is used to set is_set_initialized_. 
+ int64 initial_num_buckets = 0; + OP_REQUIRES_OK(c, c->GetAttr("initial_num_buckets", &initial_num_buckets)); + is_set_initialized_ = true; + if (initial_num_buckets == + embedding::IsSetInitialized::NOT_SET_INITAILIZED) { + is_set_initialized_ = false; + } + + int64 storage_type = 0; + OP_REQUIRES_OK(c, c->GetAttr("storage_type", &storage_type)); + storage_type_ = static_cast(storage_type); + device_type_str_ = c->device_type().type_string(); + if (storage_type_ == embedding::DEFAULT) { + if (device_type_str_ == "CPU") { + storage_type_ = embedding::DRAM; + } else { + storage_type_ = embedding::HBM; + } + } + + bool if_op_on_gpu = (device_type_str_ == "GPU"); + bool if_embedding_on_hbm = (storage_type_ == embedding::HBM || + storage_type_ == embedding::HBM_DRAM || + storage_type_ == embedding::HBM_DRAM_SSDHASH); + OP_REQUIRES( + c, if_op_on_gpu == if_embedding_on_hbm, + errors::InvalidArgument("Storage of EV and device of Op mismatch.")); + + OP_REQUIRES_OK(c, c->GetAttr("storage_path", &storage_path_)); + OP_REQUIRES_OK(c, c->GetAttr("storage_size", &storage_size_)); + + if (filter_freq_ < 0) { + LOG(INFO) << "filter_freq < 0 is invalid, feature filter is disabled."; + filter_freq_ = 0; + } + + record_freq_ |= (storage_type > 5); + record_version_ |= (storage_type > 5); + + OP_REQUIRES(c, steps_to_live_ >= 0, + errors::InvalidArgument("steps_to_live must >= 0, ", + std::to_string(steps_to_live_))); + + OP_REQUIRES_OK(c, c->GetAttr("ht_type", &ht_type_)); + OP_REQUIRES_OK(c, c->GetAttr("ht_partition_num", &ht_partition_num_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& default_values = context->input(2); + + OP_REQUIRES(context, dtype_ == default_values.dtype(), + errors::InvalidArgument( + "Variable and value dtypes don't match; respectively, ", + dtype_, " and ", default_values.dtype())); + + ResourceHandle handle_self = HandleFromInput(context, 0); + ResourceHandle handle_primary = HandleFromInput(context, 1); + std::string opname = handle_self.name(); + + EmbeddingVar* ev = nullptr; + + if (handle_self.name() == handle_primary.name() && + handle_self.container() == handle_primary.container()) { + OP_REQUIRES_OK( + context, + LookupOrCreateResource>( + context, handle_self, &ev, + [this, default_values, opname, context, + handle_self](EmbeddingVar** ptr) { + Allocator* allocator = + context->device()->GetAllocator(AllocatorAttributes()); + auto embedding_config = EmbeddingConfig( + emb_index_ + block_num_ * slot_index_, emb_index_, + block_num_, slot_num_, opname + "-primary", steps_to_live_, + filter_freq_, max_freq_, l2_weight_threshold_, + max_element_size_, false_positive_probability_, + counter_type_, default_value_dim_, + default_value_no_permission_, record_freq_, record_version_, + is_inference_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? 
ev_allocator() : allocator; + auto feat_desc = new embedding::FeatureDescriptor( + block_num_, slot_num_ + 1, alloc_for_ev, storage_type_, + record_freq_, embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq_}); + auto storage = embedding::StorageFactory::Create( + embedding::StorageConfig(storage_type_, storage_path_, + storage_size_, embedding_config), + alloc_for_ev, feat_desc, handle_self.name()); + *ptr = new EmbeddingVar(handle_self.name(), + storage, embedding_config, + alloc_for_ev, feat_desc); + return (*ptr)->Init(default_values, default_value_dim_); + })); + } else { + EmbeddingVar* primary_variable = nullptr; + OP_REQUIRES_OK( + context, + LookupOrCreateResource>( + context, handle_primary, &primary_variable, + [this, default_values, opname, handle_primary, + context](EmbeddingVar** ptr) { + int64 primary_slot_index(0), primary_emb_index(0); + Allocator* allocator = + context->device()->GetAllocator(AllocatorAttributes()); + auto embedding_config = EmbeddingConfig( + primary_emb_index + block_num_ * primary_slot_index, + primary_emb_index, block_num_, slot_num_, + opname + "-primary", steps_to_live_, filter_freq_, + max_freq_, l2_weight_threshold_, max_element_size_, + false_positive_probability_, counter_type_, 0, record_freq_, + record_version_, is_inference_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? ev_allocator() : allocator; + auto feat_desc = new embedding::FeatureDescriptor( + block_num_, slot_num_ + 1, alloc_for_ev, storage_type_, + record_freq_, embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq_}); + auto storage = embedding::StorageFactory::Create( + embedding::StorageConfig(storage_type_, storage_path_, + storage_size_, embedding_config), + alloc_for_ev, feat_desc, handle_primary.name()); + *ptr = new EmbeddingVar(handle_primary.name(), + storage, embedding_config, + alloc_for_ev, feat_desc); + // default_values is slot value, should not to initialize + // primary value + return OkStatus(); + })); + + OP_REQUIRES_OK( + context, + LookupOrCreateResource>( + context, handle_self, &ev, + [this, default_values, opname, primary_variable, handle_self, + context](EmbeddingVar** ptr) { + Allocator* allocator = + context->device()->GetAllocator(AllocatorAttributes()); + auto embedding_config = EmbeddingConfig( + emb_index_ + block_num_ * slot_index_, emb_index_, + block_num_, slot_num_, opname, steps_to_live_, filter_freq_, + max_freq_, l2_weight_threshold_, max_element_size_, + false_positive_probability_, counter_type_, + default_value_dim_, default_value_no_permission_, + record_freq_, record_version_, is_inference_); + Allocator* alloc_for_ev = + (device_type_str_ == "CPU") ? 
ev_allocator() : allocator; + *ptr = new EmbeddingVar( + handle_self.name(), primary_variable->storage(), + embedding_config, alloc_for_ev, + primary_variable->feature_descriptor()); + return (*ptr)->Init(default_values, default_value_dim_); + })); + core::ScopedUnref unref_me(primary_variable); + } + core::ScopedUnref unref_me(ev); + if (is_set_initialized_) { + ev->SetInitialized(); + } + } + + private: + DataType dtype_; + DataType counter_type_; + TensorShape shape_; + int64 steps_to_live_; + int64 emb_index_; + int64 block_num_; + int64 slot_index_; + int64 slot_num_; + std::string ht_type_; + int64 ht_partition_num_; + int64 filter_freq_; + int64 max_freq_; + float l2_weight_threshold_; + int64 max_element_size_; + float false_positive_probability_; + embedding::StorageType storage_type_; + std::string storage_path_; + std::vector storage_size_; + int64 default_value_dim_; + float default_value_no_permission_; + bool record_freq_; + bool record_version_; + bool is_inference_; + bool is_set_initialized_; + std::string device_type_str_; +}; + +#define REGISTER_KERNELS(ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("InitializeKvVariableOp") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + InitializeKvVariableOp); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +#define REGISTER_KERNELS(ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("InitializeKvVariableV2Op") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + InitializeKvVariableOp); +#define REGISTER_KERNELS_ALL_INDEX(type) \ + REGISTER_KERNELS(int32, type) \ + REGISTER_KERNELS(int64, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_ALL_INDEX) +#undef REGISTER_KERNELS_ALL_INDEX +#undef REGISTER_KERNELS + +#if GOOGLE_CUDA +#define REGISTER_KERNELS(ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("InitializeKvVariableOp") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + InitializeKvVariableOp); + +#define REGISTER_GPU_KERNELS(type) \ + REGISTER_KERNELS(int32, type); \ + REGISTER_KERNELS(int64, type); +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); +#undef REGISTER_GPU_KERNELS +#undef REGISTER_KERNELS + +#define REGISTER_KERNELS(ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("InitializeKvVariableV2Op") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + InitializeKvVariableOp); + +#define REGISTER_GPU_KERNELS(type) \ + REGISTER_KERNELS(int32, type); \ + REGISTER_KERNELS(int64, type); +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); +#undef REGISTER_GPU_KERNELS +#undef REGISTER_KERNELS +#endif // GOOGLE_CUDA + +template +class KvResourceIsInitializedOp : public OpKernel { + public: + explicit KvResourceIsInitializedOp(OpKernelConstruction* c) : OpKernel(c) {} + + void Compute(OpKernelContext* ctx) override { + Tensor* output; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, {}, &output)); + EmbeddingVar* ev = nullptr; + bool found; + if (LookupResource>(ctx, HandleFromInput(ctx, 0), + &ev) + .ok()) { + found = ev->IsInitialized(); + ev->Unref(); + } else { + found = false; + } + + output->flat()(0) = found; + } +}; +#define REGISTER_KERNELS(dev, ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("KvVarIsInitializedOp") \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype") \ + .HostMemory("is_initialized") \ + .Device(DEVICE_##dev), \ 
+ KvResourceIsInitializedOp); +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KERNELS(dev, int32, type) \ + REGISTER_KERNELS(dev, int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(CPU, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU + +#if GOOGLE_CUDA +#define REGISTER_KERNELS_GPU(type) REGISTER_KERNELS_ALL(GPU, type) +TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU) +#undef REGISTER_KERNELS_GPU +#endif // GOOGLE_CUDA + +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +template +class KvResourceIsAllSlotInitializedOp : public OpKernel { + public: + explicit KvResourceIsAllSlotInitializedOp(OpKernelConstruction* c) + : OpKernel(c) {} + + void Compute(OpKernelContext* ctx) override { + Tensor* output; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, {}, &output)); + EmbeddingVar* ev = nullptr; + bool found; + if (LookupResource>(ctx, HandleFromInput(ctx, 0), + &ev) + .ok()) { + found = ev->IsAllSlotInitialized(); + ev->Unref(); + } else { + found = false; + } + output->flat()(0) = found; + } +}; +#define REGISTER_KERNELS(dev, ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("KvVarIsAllSlotInitializedOp") \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype") \ + .HostMemory("is_all_slot_initialized") \ + .Device(DEVICE_##dev), \ + KvResourceIsAllSlotInitializedOp); +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KERNELS(dev, int32, type) \ + REGISTER_KERNELS(dev, int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(CPU, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU + +#if GOOGLE_CUDA +#define REGISTER_KERNELS_GPU(type) REGISTER_KERNELS_ALL(GPU, type) +TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU) +#undef REGISTER_KERNELS_GPU +#endif // GOOGLE_CUDA + +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +template +class KvResourceInitCacheStrategyOp : public OpKernel { + public: + explicit KvResourceInitCacheStrategyOp(OpKernelConstruction* c) + : OpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("cache_strategy", &cache_strategy_)); + } + + void Compute(OpKernelContext* ctx) override { + EmbeddingVar* ev = nullptr; + OP_REQUIRES_OK(ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &ev)); + core::ScopedUnref unref_me(ev); + ev->InitCache(static_cast(cache_strategy_)); + } + + private: + int cache_strategy_; +}; + +#define REGISTER_KERNELS(dev, ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("KvResourceInitCacheStrategyOp") \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype") \ + .Device(DEVICE_##dev), \ + KvResourceInitCacheStrategyOp); +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KERNELS(dev, int32, type) \ + REGISTER_KERNELS(dev, int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(CPU, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU + +#if GOOGLE_CUDA +#define REGISTER_KERNELS_GPU(type) REGISTER_KERNELS_ALL(GPU, type) +TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU) +#undef REGISTER_KERNELS_GPU +#endif // GOOGLE_CUDA + +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_restore_ops.cc b/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_restore_ops.cc new file mode 100644 index 00000000..3f19372a --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_restore_ops.cc @@ -0,0 +1,259 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +=======================================================================*/ + +#define EIGEN_USE_THREADS + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU + +#include "xla/stream_executor/cuda/cuda_activation.h" +using stream_executor::cuda::ScopedActivateExecutorContext; +#elif TENSORFLOW_USE_ROCM +#include "tensorflow/core/platform/rocm.h" +using stream_executor::rocm::ScopedActivateExecutorContext; + +#endif + +#include "deepray/custom_ops/embedding_variable/cc/embedding/cache.h" +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h" +#include "deepray/custom_ops/embedding_variable/cc/embedding/storage_factory.h" +#include "deepray/custom_ops/embedding_variable/config.pb.h" +#include "kv_variable_util.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/dense_update_functor.h" +#include "tensorflow/core/kernels/gather_functor.h" +#include "tensorflow/core/kernels/scatter_functor.h" +#include "tensorflow/core/kernels/training_op_helpers.h" +#include "tensorflow/core/kernels/variable_ops.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/mem.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/util/util.h" +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { + +constexpr int64 DEFAULT_RESTORE_THREAD_NUM = 4; + +class KvRestoreThreadPool { + public: + KvRestoreThreadPool() { + TF_CHECK_OK(ReadInt64FromEnvVar("TF_EV_RESTORE_THREAD_NUM", + DEFAULT_RESTORE_THREAD_NUM, &thread_num_)); + } + + static thread::ThreadPool* GetInstance() { + static thread::ThreadPool tp(Env::Default(), "restore_ev_threadpool", + thread_num_); + return &tp; + } + + private: + static int64 thread_num_; +}; + +int64 KvRestoreThreadPool::thread_num_ = DEFAULT_RESTORE_THREAD_NUM; + +template +class KvResourceImportV3Op : public AsyncOpKernel { + public: + explicit KvResourceImportV3Op(OpKernelConstruction* c) : AsyncOpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("dtype", &dtype_)); + OP_REQUIRES_OK(c, c->GetAttr("shape", &shape_)); + OP_REQUIRES(c, shape_.dims() == 1, + errors::InvalidArgument("KvVariable dimension must be 1")); + OP_REQUIRES_OK(c, c->GetAttr("partition_id", &partition_id_)); + OP_REQUIRES(c, partition_id_ >= 0, + errors::InvalidArgument("partition_id must >= 0, ", + std::to_string(partition_id_))); + OP_REQUIRES_OK(c, c->GetAttr("partition_num", &partition_num_)); + OP_REQUIRES(c, partition_num_ >= 1, + errors::InvalidArgument("partition_num must >= 1, ", + std::to_string(partition_num_))); + OP_REQUIRES_OK(c, c->GetAttr("reset_version", &reset_version_)); + bool reset_version = false; + TF_CHECK_OK( + ReadBoolFromEnvVar("TF_EV_RESET_VERSION", false, 
&reset_version)); + reset_version_ = reset_version_ || reset_version; + + TF_CHECK_OK(ReadBoolFromEnvVar("TF_ENABLE_EV_ASYNC_RESTORE", true, + &ev_async_restore_)); + } + + void ComputeAsync(OpKernelContext* context, DoneCallback done) override { + const Tensor& file_name = context->input(0); + const std::string file_name_string = file_name.scalar()(); + const Tensor& name = context->input(2); + const std::string name_string = name.scalar()(); + + EmbeddingVar* ev = nullptr; + OP_REQUIRES_OK(context, + LookupResource(context, HandleFromInput(context, 1), &ev)); + + core::ScopedUnref unref_me(ev); + + // EV should not be initialized at this time. + if (ev->IsInitialized()) { + LOG(WARNING) << "EV (" << name_string + << ") has already been initialized."; + } + + auto do_compute = [this, context, file_name_string, ev, name_string, + done]() { + BundleReader reader(Env::Default(), file_name_string); + auto s = reader.status(); + if (!s.ok()) { + LOG(FATAL) << "Restore EV failure, create BundleReader error:" + << s.ToString(); + done(); + } + + if (ev->IsSingleHbm()) { +#if GOOGLE_CUDA + ScopedActivateExecutorContext scoped_activation{ + context->op_device_context()->stream()->parent()}; + const Eigen::GpuDevice& device = context->eigen_gpu_device(); + ev->Restore(name_string, file_name_string, partition_id_, + partition_num_, false, &reader, reset_version_, &device); +#endif + } else { + ev->Restore(name_string, file_name_string, partition_id_, + partition_num_, false, &reader, reset_version_, nullptr); + } + ev->SetInitialized(); + done(); + }; + + if (ev_async_restore_) { + auto tp = KvRestoreThreadPool::GetInstance(); + tp->Schedule(do_compute); + } else { + do_compute(); + } + } + + private: + int64 partition_id_; + int64 partition_num_; + DataType dtype_; + TensorShape shape_; + bool reset_version_; + bool ev_async_restore_; +}; + +#define REGISTER_KERNELS(dev, ktype, vtype, device) \ + REGISTER_KERNEL_BUILDER(Name("KvResourceImportV3") \ + .Device(DEVICE_##dev) \ + .HostMemory("prefix") \ + .HostMemory("tensor_names") \ + .HostMemory("empty_key") \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + KvResourceImportV3Op); +#define REGISTER_KERNELS_ALL(dev, type, device) \ + REGISTER_KERNELS(dev, int32, type, device) \ + REGISTER_KERNELS(dev, int64, type, device) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(CPU, type, CPUDevice) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU + +#if GOOGLE_CUDA +#define REGISTER_KERNELS_GPU(type) REGISTER_KERNELS_ALL(GPU, type, GPUDevice) +TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU) +#undef REGISTER_KERNELS_GPU +#endif // GOOGLE_CUDA + +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS + +template +class KvResourceIncrImportOp : public AsyncOpKernel { + public: + explicit KvResourceIncrImportOp(OpKernelConstruction* c) : AsyncOpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("dtype", &dtype_)); + + OP_REQUIRES_OK(c, c->GetAttr("partition_id", &partition_id_)); + OP_REQUIRES(c, partition_id_ >= 0, + errors::InvalidArgument("partition_id must >= 0, ", + std::to_string(partition_id_))); + OP_REQUIRES_OK(c, c->GetAttr("partition_num", &partition_num_)); + OP_REQUIRES(c, partition_num_ >= 1, + errors::InvalidArgument("partition_num must >= 1, ", + std::to_string(partition_num_))); + } + + void ComputeAsync(OpKernelContext* context, DoneCallback done) override { + const Tensor& file_name = context->input(0); + const std::string file_name_string = file_name.scalar()(); + const Tensor& name = context->input(2); + 
const std::string name_string = name.scalar()(); + + EmbeddingVar* ev = nullptr; + OP_REQUIRES_OK(context, + LookupResource(context, HandleFromInput(context, 1), &ev)); + + core::ScopedUnref unref_me(ev); + + BundleReader reader(Env::Default(), file_name_string); + OP_REQUIRES_OK(context, reader.status()); + + LOG(INFO) << "incr import, evname:" << name_string + << "partition_num:" << partition_num_; + + ev->Restore(name_string, file_name_string, partition_id_, partition_num_, + true, &reader); + ev->SetInitialized(); + done(); + } + + private: + int64 partition_id_; + int64 partition_num_; + DataType dtype_; + TensorShape shape_; + int64 steps_to_live_; + bool restore_versions_; + string ht_type_; + int64 ht_partition_num_; +}; + +#define REGISTER_KERNELS(dev, ktype, vtype) \ + REGISTER_KERNEL_BUILDER(Name("KvResourceIncrImport") \ + .Device(DEVICE_##dev) \ + .TypeConstraint("Tkeys") \ + .TypeConstraint("dtype"), \ + KvResourceIncrImportOp); +#define REGISTER_KERNELS_ALL(dev, type) \ + REGISTER_KERNELS(dev, int32, type) \ + REGISTER_KERNELS(dev, int64, type) +#define REGISTER_KERNELS_CPU(type) REGISTER_KERNELS_ALL(CPU, type) +TF_CALL_FLOAT_TYPES(REGISTER_KERNELS_CPU) +#undef REGISTER_KERNELS_CPU + +#if GOOGLE_CUDA +#define REGISTER_KERNELS_GPU(type) REGISTER_KERNELS_ALL(GPU, type) +TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS_GPU) +#undef REGISTER_KERNELS_GPU +#endif // GOOGLE_CUDA + +#undef REGISTER_KERNELS_ALL +#undef REGISTER_KERNELS +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.cc b/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.cc new file mode 100644 index 00000000..63d7760d --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.cc @@ -0,0 +1,69 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#define EIGEN_USE_THREADS + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif + +#include "kv_variable_util.h" + +#include "deepray/custom_ops/embedding_variable/cc/embedding/cache.h" +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h" +#include "deepray/custom_ops/embedding_variable/cc/embedding/storage_factory.h" +#include "deepray/custom_ops/embedding_variable/config.pb.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/mem.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/env_var.h" +#include "tensorflow/core/util/util.h" +#include "tensorflow/core/util/work_sharder.h" + +namespace tensorflow { + +Status MoveMatchingFiles(Env* env, const tstring& pattern, + const tstring& merged_prefix, + int64 input_prefix_size) { + std::vector file_vec; + TF_RETURN_IF_ERROR(env->GetMatchingPaths(pattern, &file_vec)); + for (int64 i = 0; i < file_vec.size(); i++) { + const tstring& filename = tstring(file_vec[i].substr(input_prefix_size)); + TF_RETURN_IF_ERROR(env->RenameFile(file_vec[i], merged_prefix + filename)); + } + return OkStatus(); +} + +Status MoveSsdFiles(Env* env, const gtl::ArraySlice& input_prefixes, + const tstring& merged_prefix) { + for (auto input_prefix : input_prefixes) { + const tstring& input_ssd_record_pattern = input_prefix + "*-ssd_record*"; + TF_RETURN_IF_ERROR(MoveMatchingFiles(env, input_ssd_record_pattern, + merged_prefix, input_prefix.size())); + + const tstring& input_emb_files_pattern = input_prefix + "*-emb_files"; + TF_RETURN_IF_ERROR(MoveMatchingFiles(env, input_emb_files_pattern, + merged_prefix, input_prefix.size())); + } + return OkStatus(); +} + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.h b/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.h new file mode 100644 index 00000000..a44c0d8e --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.h @@ -0,0 +1,165 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_KERNELS_KV_VARIABLE_OPS_H_ +#define TENSORFLOW_KERNELS_KV_VARIABLE_OPS_H_ + +#include "deepray/custom_ops/embedding_variable/cc/embedding/cache_factory.h" +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h" +#include "deepray/custom_ops/embedding_variable/cc/embedding/kv_interface.h" +#include "deepray/custom_ops/embedding_variable/cc/lib/tensor_bundle.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/random/philox_random.h" +#include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/lib/random/random_distributions.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +using GPUDevice = Eigen::GpuDevice; + +template +class EVKeyDumpIterator : public DumpIterator { + public: + EVKeyDumpIterator(std::vector& key_list) : key_list_(key_list) { + keys_idx_ = 0; + } + + bool HasNext() const { return keys_idx_ < key_list_.size(); } + + T Next() { return key_list_[keys_idx_++]; } + + private: + int64 keys_idx_; + std::vector& key_list_; +}; + +template +class EVValueDumpIterator : public DumpIterator { + public: + EVValueDumpIterator(EmbeddingVar*& ev, std::vector& valueptr_list) + : ev_(ev), valueptr_list_(valueptr_list) { + keys_idx_ = 0; + col_idx_ = 0; + } + + bool HasNext() const { + if (keys_idx_ < valueptr_list_.size()) { + if (keys_idx_ < valueptr_list_.size() - 1) + return true; + else + return col_idx_ < ev_->ValueLen(); + } else + return false; + } + + T Next() { + if (col_idx_ >= ev_->ValueLen()) { + keys_idx_++; + col_idx_ = 0; + } + Eigen::array dims({ev_->ValueLen()}); + typename TTypes::Flat value_flat = + typename TTypes::Flat(valueptr_list_[keys_idx_], dims); + return value_flat(col_idx_++); + } + + private: + EmbeddingVar* ev_; + std::vector& valueptr_list_; + int64 keys_idx_; + int64 col_idx_; +}; + +template +class EVVersionDumpIterator : public DumpIterator { + public: + EVVersionDumpIterator(std::vector& version_list) + : version_list_(version_list) { + keys_idx_ = 0; + } + + bool HasNext() const { return keys_idx_ < version_list_.size(); } + + T Next() { return version_list_[keys_idx_++]; } + + private: + std::vector& version_list_; + int64 keys_idx_; +}; + +template +class EVFreqDumpIterator : public DumpIterator { + public: + EVFreqDumpIterator(std::vector& freq_list) : freq_list_(freq_list) { + keys_idx_ = 0; + } + + bool HasNext() const { return keys_idx_ < freq_list_.size(); } + + T Next() { return freq_list_[keys_idx_++]; } + + private: + std::vector& freq_list_; + int64 keys_idx_; +}; + +template +class EVOffsetDumpIterator : public DumpIterator { + public: + EVOffsetDumpIterator(std::vector& offset_list) + : offset_list_(offset_list) { + keys_idx_ = 0; + } + + bool HasNext() const { return keys_idx_ < offset_list_.size(); } + + T Next() { return offset_list_[keys_idx_++]; } + + private: + std::vector& offset_list_; + int64 keys_idx_; +}; + +template +Status GetInputEmbeddingVar(OpKernelContext* ctx, int input, + EmbeddingVar** var) { + if (LookupResource(ctx, HandleFromInput(ctx, input), var).ok()) { + return OkStatus(); + } 
else { + return errors::Internal("Invalid versioned variable reference."); + } +} + +Status MoveMatchingFiles(Env* env, const tstring& pattern, + const tstring& merged_prefix, int64 input_prefix_size); + +/*Move two files and one directory: +1. xxxxx-ssd_record.index +2. xxxxx-ssd_record.data +3. xxxxxx-emb_files/ +1 and 2 record the meta data of SSDHash, +and 3 records the embeddings on SSD*/ +Status MoveSsdFiles(Env* env, const gtl::ArraySlice& input_prefixes, + const tstring& merged_prefix); +} // namespace tensorflow + +#endif // TENSORFLOW_KERNELS_KV_VARIABLE_OPS_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/save_restore_ops.cc b/deepray/custom_ops/embedding_variable/cc/kernels/save_restore_ops.cc new file mode 100644 index 00000000..9a19319a --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/save_restore_ops.cc @@ -0,0 +1,176 @@ +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h" +#include "deepray/custom_ops/embedding_variable/cc/lib/tensor_bundle.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/util/saved_tensor_slice_util.h" + +namespace tensorflow { + +namespace { + +// Shared validations of the inputs to the SaveV2 and RestoreV2 ops. +void ValidateInputs(bool is_save_op, OpKernelContext* context, + const Tensor& prefix, const Tensor& tensor_names, + const Tensor& shape_and_slices, const int kFixedInputs) { + const int num_tensors = static_cast(tensor_names.NumElements()); + OP_REQUIRES( + context, prefix.NumElements() == 1, + errors::InvalidArgument("Input prefix should have a single element, got ", + prefix.NumElements(), " instead.")); + OP_REQUIRES(context, + TensorShapeUtils::IsVector(tensor_names.shape()) && + TensorShapeUtils::IsVector(shape_and_slices.shape()), + errors::InvalidArgument( + "Input tensor_names and shape_and_slices " + "should be an 1-D tensors, got ", + tensor_names.shape().DebugString(), " and ", + shape_and_slices.shape().DebugString(), " instead.")); + OP_REQUIRES(context, + tensor_names.NumElements() == shape_and_slices.NumElements(), + errors::InvalidArgument("tensor_names and shape_and_slices " + "have different number of elements: ", + tensor_names.NumElements(), " vs. 
", + shape_and_slices.NumElements())); + OP_REQUIRES(context, + FastBoundsCheck(tensor_names.NumElements() + kFixedInputs, + std::numeric_limits::max()), + errors::InvalidArgument("Too many inputs to the op")); + OP_REQUIRES( + context, shape_and_slices.NumElements() == num_tensors, + errors::InvalidArgument("Expected ", num_tensors, + " elements in shapes_and_slices, but got ", + context->input(2).NumElements())); + if (is_save_op) { + OP_REQUIRES(context, context->num_inputs() == num_tensors + kFixedInputs, + errors::InvalidArgument( + "Got ", num_tensors, " tensor names but ", + context->num_inputs() - kFixedInputs, " tensors.")); + OP_REQUIRES(context, context->num_inputs() == num_tensors + kFixedInputs, + errors::InvalidArgument( + "Expected a total of ", num_tensors + kFixedInputs, + " inputs as input #1 (which is a string " + "tensor of saved names) contains ", + num_tensors, " names, but received ", context->num_inputs(), + " inputs")); + } +} + +} // namespace + +class SaveV3 : public OpKernel { + public: + explicit SaveV3(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("dtypes", &tensor_types_)); + OP_REQUIRES_OK(context, context->GetAttr("ev_key_types", &ev_key_types_)); + OP_REQUIRES_OK(context, context->GetAttr("has_ev", &has_ev_)); + } + + template + void DumpEvWithGlobalStep(OpKernelContext* context, const string& tensor_name, + EmbeddingVar* ev, + BundleWriter& writer, DataType global_step_type) { + if (global_step_type == DT_INT32) { + DumpEv(context, ev, tensor_name, writer); + } else { + DumpEv(context, ev, tensor_name, writer); + } + } + + template + void DumpEv(OpKernelContext* context, EmbeddingVar* variable, + const string& tensor_name, BundleWriter& writer) { + const Tensor& global_step = context->input(5); + TGlobalStep global_step_scalar = global_step.scalar()(); + core::ScopedUnref s(variable); + embedding::ShrinkArgs shrink_args; + shrink_args.global_step = global_step_scalar; + const Tensor& prefix = context->input(0); + const string& prefix_string = prefix.scalar()(); + OP_REQUIRES_OK(context, variable->Save(tensor_name, prefix_string, &writer, + shrink_args)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& prefix = context->input(0); + const Tensor& tensor_names = context->input(1); + const Tensor& shape_and_slices = context->input(2); + const Tensor& ev_names = context->input(3); + const Tensor& ev_resources = context->input(4); + const int kFixedInputs = 5; + ValidateInputs(true /* is save op */, context, prefix, tensor_names, + shape_and_slices, kFixedInputs); + if (!context->status().ok()) return; + // Prefix, tensor names, shape_and_slices, ev names, ev resources. 
+ const int num_tensors = static_cast(tensor_names.NumElements()); + const int num_ev = static_cast(ev_names.NumElements()); + const string& prefix_string = prefix.scalar()(); + const auto& tensor_names_flat = tensor_names.flat(); + const auto& ev_names_flat = ev_names.flat(); + const auto& ev_resources_flat = ev_resources.flat(); + const auto& shape_and_slices_flat = shape_and_slices.flat(); + + BundleWriter writer(Env::Default(), prefix_string); + OP_REQUIRES_OK(context, writer.status()); + VLOG(1) << "BundleWriter, prefix_string: " << prefix_string; + + int start_index = 0; + if (has_ev_) { + start_index = 1; + } + + for (int i = 0; i < num_ev; i++) { + const string& ev_name = ev_names_flat(i); + if (ev_key_types_[i] == DT_INT32) { + EmbeddingVar* ev = + reinterpret_cast*>(ev_resources_flat(i)); + DumpEvWithGlobalStep(context, ev_name, ev, writer, tensor_types_[0]); + } else if (ev_key_types_[i] == DT_INT64) { + EmbeddingVar* ev = + reinterpret_cast*>(ev_resources_flat(i)); + DumpEvWithGlobalStep(context, ev_name, ev, writer, tensor_types_[0]); + } + } + + for (int i = start_index; i < num_tensors; ++i) { + const string& tensor_name = tensor_names_flat(i); + if (tensor_types_[i] == DT_RESOURCE) { + auto& handle = HandleFromInput(context, i + kFixedInputs); + + } else { + const Tensor& tensor = context->input(i + kFixedInputs); + + if (!shape_and_slices_flat(i).empty()) { + const string& shape_spec = shape_and_slices_flat(i); + TensorShape shape; + TensorSlice slice(tensor.dims()); + TensorShape slice_shape; + + OP_REQUIRES_OK(context, + checkpoint::ParseShapeAndSlice(shape_spec, &shape, + &slice, &slice_shape)); + OP_REQUIRES( + context, slice_shape.IsSameSize(tensor.shape()), + errors::InvalidArgument( + "Slice in shape_and_slice " + "specification does not match the " + "shape of the tensor to save: ", + shape_spec, ", tensor: ", tensor.shape().DebugString())); + + OP_REQUIRES_OK(context, + writer.AddSlice(tensor_name, shape, slice, tensor)); + } else { + OP_REQUIRES_OK(context, writer.Add(tensor_name, tensor)); + } + } + } + OP_REQUIRES_OK(context, writer.Finish()); + } + + private: + DataTypeVector tensor_types_; + DataTypeVector ev_key_types_; + bool has_ev_; +}; +REGISTER_KERNEL_BUILDER(Name("SaveV3").Device(DEVICE_CPU), SaveV3); + +} // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/save_restore_tensor_ev.h b/deepray/custom_ops/embedding_variable/cc/kernels/save_restore_tensor_ev.h new file mode 100644 index 00000000..4b3a5fa1 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/save_restore_tensor_ev.h @@ -0,0 +1,82 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_SAVE_RESTORE_TENSOR_EV_H_ +#define TENSORFLOW_CORE_KERNELS_SAVE_RESTORE_TENSOR_EV_H_ + +#include "deepray/custom_ops/embedding_variable/cc/lib/tensor_bundle.h" +#include "tensorflow/core/util/tensor_slice_reader.h" +#include "tensorflow/core/util/tensor_slice_writer.h" + +namespace tensorflow { + +class OpKernelContext; + +template +class DumpIterator { + public: + virtual ~DumpIterator() {} + virtual bool HasNext() const = 0; + virtual T Next() = 0; +}; + +template +Status SaveTensorWithFixedBuffer(const string& tensor_name, + BundleWriter* writer, char* dump_buffer, + size_t bytes_limit, DumpIterator* dump_iter, + const TensorShape& dump_tensor_shape, + bool use_shape = true) { + bool dump_happened = false; + size_t bytes_written = 0; + int buffer_idx = 0; + Status st; + int64 total_bytes_written = 0; + T* key_dump_buffer = (T*)dump_buffer; + if (use_shape) + st = writer->AddTensorHeader(tensor_name, DataTypeToEnum::v(), + dump_tensor_shape); + if (!st.ok()) return st; + + while (dump_iter->HasNext()) { + T key = dump_iter->Next(); + if (bytes_written + sizeof(T) > bytes_limit) { + dump_happened = true; + TF_CHECK_OK(writer->AppendSegmentData(dump_buffer, bytes_written)); + bytes_written = 0; + buffer_idx = 0; + } + key_dump_buffer[buffer_idx] = key; + buffer_idx++; + bytes_written += sizeof(T); + total_bytes_written += sizeof(T); + } + + if (!dump_happened) { + VLOG(1) << tensor_name + << " only one buffer written, size:" << bytes_written; + TF_CHECK_OK(writer->AddCompeleteData(dump_buffer, bytes_written)); + } else { + VLOG(1) << tensor_name + << " mutiple buffer written, size:" << total_bytes_written + << ", bytes written:" << bytes_written; + TF_CHECK_OK(writer->AppendSegmentData(dump_buffer, bytes_written)); + writer->EndSegmentData(total_bytes_written, bytes_written); + } + return OkStatus(); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_SAVE_RESTORE_TENSOR_EV_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/training_adagrad_ops.cc b/deepray/custom_ops/embedding_variable/cc/kernels/training_adagrad_ops.cc new file mode 100644 index 00000000..46e72845 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/training_adagrad_ops.cc @@ -0,0 +1,383 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#define EIGEN_USE_THREADS +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA +#include + +#include "deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/training_op_helpers.h" +#include "tensorflow/core/kernels/variable_ops.h" +#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#include "tensorflow/core/util/work_sharder.h" +#include "training_ali_op_helpers.h" + +#ifdef TENSORFLOW_USE_SYCL +#include "tensorflow/core/common_runtime/sycl/sycl_util.h" +#endif // TENSORFLOW_USE_SYCL + +#if GOOGLE_CUDA +#include "tensorflow/core/kernels/gpu_device_array.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "training_ali_ops_gpu.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { + +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; +using SYCLDevice = Eigen::SyclDevice; + +template +class KvSparseApplyAdagradOp : public OpKernel { + public: + explicit KvSparseApplyAdagradOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + } + + void Compute(OpKernelContext* ctx) override TF_NO_THREAD_SAFETY_ANALYSIS { + auto locks = MaybeLockEmbeddingVariableInputMutexesInOrder( + ctx, use_exclusive_lock_, {0, 1}); + + EmbeddingVar* var = NULL; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 0, &var)); + core::ScopedUnref unref_var(var); + EmbeddingVar* accum = NULL; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 1, &accum)); + core::ScopedUnref unref_accum(accum); + + const Tensor& lr = ctx->input(2); + const Tensor& grad = ctx->input(3); + const Tensor& indices = ctx->input(4); + const Tensor& global_step = ctx->input(5); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), + errors::InvalidArgument("lr is not a scalar: ", + lr.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), + errors::InvalidArgument("indices must be one-dimensional")); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(global_step.shape()), + errors::InvalidArgument("global_step is not a scalar: ", + global_step.shape().DebugString())); + + int64 inner_dim = 1; + TensorShape var_shape({var->ValueLen()}); + for (int d = 0; d < var_shape.dims(); d++) { + OP_REQUIRES(ctx, var_shape.dim_size(d) == grad.dim_size(d + 1), + errors::InvalidArgument(strings::StrCat( + "var and grad must match in dimension ", d + 1))); + inner_dim *= grad.dim_size(d + 1); + } + OP_REQUIRES(ctx, inner_dim > 0, + errors::InvalidArgument( + "Inner dimension should be greater than zero.")); + + const int64 N = indices.dim_size(0); + OP_REQUIRES( + ctx, grad.dim_size(0) == N, + errors::InvalidArgument( + "grad must be the same size as indices in the first dimension.")); + + int64* indices_counts = nullptr; + std::function get_count_fn = 0; + if (has_counts) { + const Tensor& counts_tensor = ctx->input(6); + indices_counts = (int64*)counts_tensor.data(); + get_count_fn = [](int64* counts, int64 index) { return counts[index]; }; + } else { + get_count_fn = [](int64* counts, int64 index) { return 1; }; + } + + if (N > 0) { + if (inner_dim > 0) { + auto indices_vec = indices.vec(); + auto grad_flat = grad.flat_outer_dims(); + T lr_scalar = lr.scalar()(); + int64 gs = global_step.scalar()(); + auto do_work = [this, 
ctx, &indices_vec, var, accum, &grad_flat, &gs, + &lr_scalar, indices_counts, + get_count_fn](int64 start_i, int64 limit_i) { + for (int64 i = start_i; i < limit_i; i++) { + const Tindex index = indices_vec(i); + void* value_ptr = nullptr; + bool is_filter = false; + int64 count = get_count_fn(indices_counts, i); + OP_REQUIRES_OK(ctx, + var->LookupOrCreateKey(index, &value_ptr, &is_filter, + indices_as_pointer, count)); + var->UpdateVersion(value_ptr, gs); + if (is_filter) { + auto var_i = var->flat(value_ptr); + auto a = accum->flat(value_ptr); + auto g = grad_flat.template chip<0>(i); + a += g.square(); + var_i -= g.constant(lr_scalar) * g * a.rsqrt(); + } + } + }; + const int64 cost = 1000; // very unreliable estimate for cost per step. + auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); + Shard(worker_threads.num_threads, worker_threads.workers, N, cost, + do_work); + + if (has_counts && !indices_as_pointer) { + const Tensor& indices_counts = ctx->input(6); + var->UpdateCache(indices, indices_counts); + } + } + } + } + + private: + bool use_exclusive_lock_; +}; + +#define REGISTER_KERNELS(T, Tindices) \ + REGISTER_KERNEL_BUILDER(Name("KvResourceSparseApplyAdagrad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyAdagradOp); \ + REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdagrad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyAdagradOp); \ + REGISTER_KERNEL_BUILDER(Name("KvResourceSparseApplyAdagradWithCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyAdagradOp); \ + REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdagradWithCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyAdagradOp); +#define REGISTER_CPU_KERNELS(T) \ + REGISTER_KERNELS(T, int32); \ + REGISTER_KERNELS(T, int64); + +TF_CALL_float(REGISTER_CPU_KERNELS); + +#undef REGISTER_CPU_KERNELS +#undef REGISTER_KERNELS + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +template +class KvSparseApplyAdagradGPUOp : public OpKernel { + public: + explicit KvSparseApplyAdagradGPUOp(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + + int num_worker_threads = + ctx->device()->tensorflow_cpu_worker_threads()->num_threads; + thread_copy_id_alloc_.reset( + new IntraThreadCopyIdAllocator(num_worker_threads)); + } + + void ApplyGradients(EmbeddingVar* var, + EmbeddingVar* accum, T** var_ptr, T** acc_ptr, + T lr_scalar, const T* grad_base, const int64 task_size, + se::Stream* stream, EventMgr* event_mgr, + const Eigen::GpuDevice& gpu_device) { + // Send pointers of embeddings to GPU + T** dev_var_ptr = (T**)var->GetBuffer(task_size * 2); + T** dev_acc_ptr = dev_var_ptr + task_size; + CHECK(dev_var_ptr); + CHECK(dev_acc_ptr); + se::DeviceMemoryBase dst_ptr(dev_var_ptr, sizeof(T*) * task_size * 2); + stream->ThenMemcpy(&dst_ptr, var_ptr, sizeof(T*) * task_size * 2); + + int block_size = 128; + int embedding_dim = var->ValueLen(); + functor::KvSparseApplyAdagradHbm()( + block_size, embedding_dim, dev_acc_ptr, dev_var_ptr, grad_base, + lr_scalar, task_size, gpu_device); + SyncWithEventMgr(stream, event_mgr); + } + + void Compute(OpKernelContext* ctx) override TF_NO_THREAD_SAFETY_ANALYSIS { + auto locks = MaybeLockEmbeddingVariableInputMutexesInOrder( + ctx, use_exclusive_lock_, {0, 1}); + + EmbeddingVar* var = nullptr; + 
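+    // Same per-key Adagrad update as the CPU kernel above
+    // (accum += grad^2; var -= lr * grad / sqrt(accum)), but executed on the
+    // GPU: the single-HBM case calls the KvSparseApplyAdagrad functor
+    // directly, otherwise the ids are copied to the host, value pointers are
+    // gathered via GetEmbeddingPointers, and KvSparseApplyAdagradHbm applies
+    // the update on device.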
OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 0, &var)); + core::ScopedUnref unref_var(var); + EmbeddingVar* accum = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 1, &accum)); + core::ScopedUnref unref_accum(accum); + + const Tensor& lr = ctx->input(2); + const Tensor& grad = ctx->input(3); + const Tensor& indices = ctx->input(4); + const Tensor& global_step = ctx->input(5); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), + errors::InvalidArgument("lr is not a scalar: ", + lr.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), + errors::InvalidArgument("indices must be one-dimensional")); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(global_step.shape()), + errors::InvalidArgument("global_step is not a scalar: ", + global_step.shape().DebugString())); + + int64 inner_dim = 1; + TensorShape var_shape({var->ValueLen()}); + for (int d = 0; d < var_shape.dims(); d++) { + OP_REQUIRES(ctx, var_shape.dim_size(d) == grad.dim_size(d + 1), + errors::InvalidArgument(strings::StrCat( + "var and grad must match in dimension ", d + 1))); + inner_dim *= grad.dim_size(d + 1); + } + OP_REQUIRES(ctx, inner_dim > 0, + errors::InvalidArgument( + "Inner dimension should be greater than zero.")); + + const int64 N = indices.dim_size(0); + OP_REQUIRES( + ctx, grad.dim_size(0) == N, + errors::InvalidArgument( + "grad must be the same size as indices in the first dimension.")); + + if (N > 0) { + if (inner_dim > 0) { + auto indices_flat = indices.flat(); + auto grad_flat = grad.flat_outer_dims(); + int64 gs = global_step.scalar()(); + T lr_scalar = lr.scalar()(); + if (var->IsSingleHbm()) { + const Tindex* key_base = &indices_flat(0); + const T* grad_base = &grad_flat(0); + const Device& device = ctx->eigen_device(); + + functor::KvSparseApplyAdagrad()( + N, ctx->get_allocator(AllocatorAttributes()), var, accum, + key_base, grad_base, lr_scalar, gs, device); + } else { + Tensor indices_temp_host(indices.dtype(), indices.shape()); + const Tensor* indices_host_ptr = nullptr; + // Copy ids from GPU to CPU for CPU Lookup. + auto stream = ctx->op_device_context()->stream(); + auto event_mgr = + ctx->device()->tensorflow_accelerator_device_info()->event_mgr; + if (!indices_as_pointer) { + indices_host_ptr = &indices_temp_host; + se::DeviceMemoryBase gpu_src(const_cast(&indices_flat(0)), + N * sizeof(Tindex)); + stream->ThenMemcpy(indices_host_ptr->data(), gpu_src, + N * sizeof(Tindex)); + SyncWithEventMgr(stream, event_mgr); + } else { + indices_host_ptr = &indices; + } + + int counts_index = has_counts ? 
6 : -1; + T** var_ptr = new T*[N * 2]; + T** acc_ptr = var_ptr + N; + std::vector*, T**>> vars(2); + vars[0] = std::pair*, T**>(var, var_ptr); + vars[1] = std::pair*, T**>(accum, acc_ptr); + GetEmbeddingPointers(ctx, vars, (Tindex*)indices_host_ptr->data(), gs, + indices_as_pointer, counts_index, N, + thread_copy_id_alloc_.get()); + + ApplyGradients(var, accum, var_ptr, acc_ptr, lr_scalar, &grad_flat(0), + N, stream, event_mgr, ctx->eigen_device()); + + if (has_counts && !indices_as_pointer) { + const Tensor& counts_tensor = ctx->input(counts_index); + var->UpdateCache(*indices_host_ptr, counts_tensor); + } + + delete[] var_ptr; + } + } + } + } + + private: + bool use_exclusive_lock_; + std::unique_ptr thread_copy_id_alloc_; +}; + +namespace functor { +#define DECLARE_GPU_SPEC(T, Tindex) \ + template <> \ + void KvSparseApplyAdagrad::operator()( \ + int32 num_items, Allocator* alloc, EmbeddingVar* var, \ + EmbeddingVar* accum, const Tindex* key_base, const T* grad, \ + T lr, int64 gs, const GPUDevice& device); \ + extern template struct KvSparseApplyAdagrad; +DECLARE_GPU_SPEC(float, int32); +DECLARE_GPU_SPEC(double, int32); +DECLARE_GPU_SPEC(float, int64); +DECLARE_GPU_SPEC(double, int64); +#undef DECLARE_GPU_SPEC +} // end of namespace functor + +#define REGISTER_KERNELS(T, Tindices) \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyAdagrad") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("lr") \ + .HostMemory("global_step") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyAdagradGPUOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyAdagrad") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("indices") \ + .HostMemory("lr") \ + .HostMemory("global_step") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyAdagradGPUOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyAdagradWithCounts") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("lr") \ + .HostMemory("global_step") \ + .HostMemory("indices_counts") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyAdagradGPUOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyAdagradWithCounts") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("indices") \ + .HostMemory("lr") \ + .HostMemory("global_step") \ + .HostMemory("indices_counts") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyAdagradGPUOp); +#define REGISTER_GPU_KERNELS(T) \ + REGISTER_KERNELS(T, int32); \ + REGISTER_KERNELS(T, int64); + +TF_CALL_float(REGISTER_GPU_KERNELS); +#undef REGISTER_GPU_KERNELS +#undef REGISTER_KERNELS +#endif // End of GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/training_adam_async_ops.cc b/deepray/custom_ops/embedding_variable/cc/kernels/training_adam_async_ops.cc new file mode 100644 index 00000000..0517788c --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/training_adam_async_ops.cc @@ -0,0 +1,603 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#define EIGEN_USE_THREADS +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA +#include + +#include "deepray/custom_ops/embedding_variable/cc/embedding/intra_thread_copy_id_allocator.h" +#include "deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/training_op_helpers.h" +#include "tensorflow/core/kernels/variable_ops.h" +#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#include "tensorflow/core/util/work_sharder.h" +#include "training_ali_op_helpers.h" + +#ifdef TENSORFLOW_USE_SYCL +#include "tensorflow/core/common_runtime/sycl/sycl_util.h" +#endif // TENSORFLOW_USE_SYCL + +#if GOOGLE_CUDA +#include "tensorflow/core/kernels/gpu_device_array.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "training_ali_ops_gpu.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { + +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; +using SYCLDevice = Eigen::SyclDevice; + +template +class KvSparseApplyAdamAsyncOp : public OpKernel { + public: + explicit KvSparseApplyAdamAsyncOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + OP_REQUIRES_OK( + ctx, ctx->GetAttr("apply_sparse_rmsprop", &apply_sparse_rmsprop_)); + } + + void Compute(OpKernelContext* ctx) override TF_NO_THREAD_SAFETY_ANALYSIS { + auto locks = MaybeLockEmbeddingVariableInputMutexesInOrder( + ctx, use_exclusive_lock_, {0, 1, 2, 3, 4}); + EmbeddingVar* var = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 0, &var)); + core::ScopedUnref unref_var(var); + + EmbeddingVar* m = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 1, &m)); + core::ScopedUnref unref_m(m); + + EmbeddingVar* v = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 2, &v)); + core::ScopedUnref unref_v(v); + + Tensor beta1_power; + OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( + ctx, 3, use_exclusive_lock_, true, &beta1_power)); + + Tensor beta2_power; + OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( + ctx, 4, use_exclusive_lock_, true, &beta2_power)); + OP_REQUIRES( + ctx, beta1_power.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", requested_input(3))); + OP_REQUIRES( + ctx, beta2_power.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", requested_input(4))); + + const Tensor& lr = ctx->input(5); + const Tensor& beta1 = ctx->input(6); + const Tensor& beta2 = ctx->input(7); + const Tensor& epsilon = ctx->input(8); + const Tensor& grad = ctx->input(9); + const Tensor& indices = ctx->input(10); + const Tensor& global_step = ctx->input(11); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), + errors::InvalidArgument("lr is not a scalar: ", + lr.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1.shape()), + errors::InvalidArgument("beta1 is not a scalar: ", + beta1.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2.shape()), + errors::InvalidArgument("beta2 is not a scalar: ", + beta2.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()), + 
errors::InvalidArgument("epsilon is not a scalar: ", + epsilon.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), + errors::InvalidArgument("indices must be one-dimensional")); + + int64 inner_dim = 1; + TensorShape var_shape({var->ValueLen()}); + for (int d = 0; d < var_shape.dims(); d++) { + OP_REQUIRES(ctx, var_shape.dim_size(d) == grad.dim_size(d + 1), + errors::InvalidArgument(strings::StrCat( + "var and grad must match in dimension ", d + 1))); + inner_dim *= grad.dim_size(d + 1); + } + OP_REQUIRES(ctx, inner_dim > 0, + errors::InvalidArgument( + "Inner dimension should be greater than zero.")); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(global_step.shape()), + errors::InvalidArgument("global_step is not a scalar: ", + global_step.shape().DebugString())); + + const int64 N = indices.dim_size(0); + OP_REQUIRES( + ctx, grad.dim_size(0) == N, + errors::InvalidArgument( + "grad must be the same size as indices in the first dimension.")); + int64* indices_counts = nullptr; + std::function get_count_fn = 0; + if (has_counts) { + const Tensor& counts_tensor = ctx->input(12); + indices_counts = (int64*)counts_tensor.data(); + get_count_fn = [](int64* counts, int64 index) { return counts[index]; }; + } else { + get_count_fn = [](int64* counts, int64 index) { return 1; }; + } + if (N > 0) { + if (apply_sparse_rmsprop_) { + auto indices_vec = indices.vec(); + + auto grad_flat = grad.flat_outer_dims(); + const T lr_scalar = lr.scalar()(); + const T beta1_scalar = beta1.scalar()(); + const T beta2_scalar = beta2.scalar()(); + const T epsilon_scalar = epsilon.scalar()(); + + auto do_work = [this, ctx, &indices_vec, &var, v, m, &grad_flat, + &beta2_scalar, &beta1_scalar, &epsilon_scalar, + &lr_scalar, &global_step, get_count_fn, + indices_counts](int64 start_i, int64 limit_i) { + Tstep gs = global_step.scalar()(); + for (int64 i = start_i; i < limit_i; i++) { + const Tindex index = indices_vec(i); + void* value_ptr = nullptr; + bool is_filter = false; + int64 count = get_count_fn(indices_counts, i); + OP_REQUIRES_OK(ctx, + var->LookupOrCreateKey(index, &value_ptr, &is_filter, + indices_as_pointer, count)); + var->UpdateVersion(value_ptr, gs); + if (is_filter) { + auto v_ = v->flat(value_ptr); + auto m_ = m->flat(value_ptr); + auto grad_ = grad_flat.template chip<0>(i); + + v_ = v_ * v_.constant(beta2_scalar) + + grad_.square() * grad_.constant(T(1) - beta2_scalar); + m_ = m_ * m_.constant(beta1_scalar) + + (v_ + v_.constant(epsilon_scalar)).rsqrt() * + v_.constant(lr_scalar) * grad_; + + auto v = var->flat(value_ptr); + v -= m_; + } + } + }; + const int64 cost = 1000; + auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); + Shard(worker_threads.num_threads, worker_threads.workers, N, cost, + do_work); + } else { + auto beta1_power_scalar = beta1_power.scalar(); + auto beta2_power_scalar = beta2_power.scalar(); + T lr_scalar = lr.scalar()(); + T beta1_scalar = beta1.scalar()(); + T beta2_scalar = beta2.scalar()(); + T epsilon_scalar = epsilon.scalar()(); + const T alpha = + lr_scalar * + Eigen::numext::sqrt(static_cast(1) - beta2_power_scalar()) / + (static_cast(1) - beta1_power_scalar()); + + auto do_work = [this, ctx, inner_dim, &var, &m, &v, &grad, &indices, + &lr_scalar, &beta1_scalar, &beta1_power, &beta2_power, + &beta2_scalar, &epsilon_scalar, &alpha, &global_step, + get_count_fn, + indices_counts](int64 start_i, int64 limit_i) { + if (inner_dim > 0) { + auto grad_flat = grad.flat_outer_dims(); + auto indices_vec = indices.vec(); 
+ Tstep gs = global_step.scalar()(); + + for (int64 i = start_i; i < limit_i; i++) { + const Tindex index = indices_vec(i); + void* value_ptr = nullptr; + bool is_filter = false; + int64 count = get_count_fn(indices_counts, i); + OP_REQUIRES_OK( + ctx, var->LookupOrCreateKey(index, &value_ptr, &is_filter, + indices_as_pointer, count)); + var->UpdateVersion(value_ptr, gs); + if (is_filter) { + auto m_a = m->flat(value_ptr); + auto v_a = v->flat(value_ptr); + auto g = grad_flat.template chip<0>(i); + auto var_i = var->flat(value_ptr); + + m_a = + m_a * beta1_scalar + g * (static_cast(1) - beta1_scalar); + v_a = v_a * beta2_scalar + + g.square() * (static_cast(1) - beta2_scalar); + var_i -= (m_a * alpha) / (v_a.sqrt() + epsilon_scalar); + } + } + } + }; + + const int64 cost = 1000; + auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); + Shard(worker_threads.num_threads, worker_threads.workers, N, cost, + do_work); + + beta1_power_scalar() *= beta1_scalar; + beta2_power_scalar() *= beta2_scalar; + } + if (has_counts && !indices_as_pointer) { + const Tensor& indices_counts = ctx->input(12); + var->UpdateCache(indices, indices_counts); + } + } + + MaybeForwardRefInputToRefOutput(ctx, 0, 0); + } + + private: + bool use_exclusive_lock_; + bool apply_sparse_rmsprop_; +}; + +#define REGISTER_KERNELS(D, T, Tindices, Tstep) \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyAdamAsync") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamAsyncOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyAdamAsync") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamAsyncOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyAdamAsyncWithCounts") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamAsyncOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyAdamAsyncWithCounts") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamAsyncOp); + +#define REGISTER_CPU_KERNELS(T) \ + REGISTER_KERNELS(CPU, T, int32, int32); \ + REGISTER_KERNELS(CPU, T, int64, int32); \ + REGISTER_KERNELS(CPU, T, int32, int64); \ + REGISTER_KERNELS(CPU, T, int64, int64); + +TF_CALL_half(REGISTER_CPU_KERNELS); +TF_CALL_bfloat16(REGISTER_CPU_KERNELS); +TF_CALL_float(REGISTER_CPU_KERNELS); +TF_CALL_double(REGISTER_CPU_KERNELS); + +#undef REGISTER_CPU_KERNELS +#undef REGISTER_KERNELS + +#if GOOGLE_CUDA +template +class KvSparseApplyAdamAsyncGPUOp : public OpKernel { + public: + explicit KvSparseApplyAdamAsyncGPUOp(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + OP_REQUIRES_OK( + ctx, ctx->GetAttr("apply_sparse_rmsprop", &apply_sparse_rmsprop_)); + + int num_worker_threads = + ctx->device()->tensorflow_cpu_worker_threads()->num_threads; + thread_copy_id_alloc_.reset( + new IntraThreadCopyIdAllocator(num_worker_threads)); + } + + void ApplyGradients(EmbeddingVar* var, EmbeddingVar* m, + EmbeddingVar* v, T** var_ptr, T** m_ptr, + T** v_ptr, T beta1, T beta2, T epsilon, T lr, + typename TTypes::Scalar beta1_power_scalar, + typename TTypes::Scalar beta2_power_scalar, + const T* grad_base, const int64 task_size, + se::Stream* stream, EventMgr* event_mgr, + const Eigen::GpuDevice& 
gpu_device) { + // Send pointers of embeddings to GPU + T** dev_var_ptr = (T**)var->GetBuffer(task_size * 3); + T** dev_m_ptr = dev_var_ptr + task_size; + T** dev_v_ptr = dev_m_ptr + task_size; + CHECK(dev_var_ptr); + CHECK(dev_m_ptr); + CHECK(dev_v_ptr); + + se::DeviceMemoryBase dst_ptr(dev_var_ptr, sizeof(T*) * task_size * 3); + stream->ThenMemcpy(&dst_ptr, var_ptr, sizeof(T*) * task_size * 3); + + int block_size = 128; + int embedding_dim = var->ValueLen(); + T* beta1_power_ptr = beta1_power_scalar.data(); + T* beta2_power_ptr = beta2_power_scalar.data(); + if (apply_sparse_rmsprop_) { + functor::KvSparseApplyAdamAsyncSparseRmspropHbm()( + block_size, embedding_dim, dev_var_ptr, dev_m_ptr, dev_v_ptr, + grad_base, lr, beta1, beta2, epsilon, task_size, gpu_device); + } else { + functor::KvSparseApplyAdamAsyncHbm()( + block_size, embedding_dim, dev_var_ptr, dev_m_ptr, dev_v_ptr, + grad_base, lr, beta1, beta2, epsilon, beta1_power_ptr, + beta2_power_ptr, task_size, gpu_device); + } + SyncWithEventMgr(stream, event_mgr); + } + + void Compute(OpKernelContext* ctx) override TF_NO_THREAD_SAFETY_ANALYSIS { + auto locks = MaybeLockEmbeddingVariableInputMutexesInOrder( + ctx, use_exclusive_lock_, {0, 1, 2, 3, 4}); + EmbeddingVar* var = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 0, &var)); + core::ScopedUnref unref_var(var); + + EmbeddingVar* m = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 1, &m)); + core::ScopedUnref unref_m(m); + + EmbeddingVar* v = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 2, &v)); + core::ScopedUnref unref_v(v); + + Tensor beta1_power; + OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( + ctx, 3, use_exclusive_lock_, true, &beta1_power)); + + Tensor beta2_power; + OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( + ctx, 4, use_exclusive_lock_, true, &beta2_power)); + OP_REQUIRES( + ctx, beta1_power.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", requested_input(3))); + OP_REQUIRES( + ctx, beta2_power.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", requested_input(4))); + + const Tensor& lr = ctx->input(5); + const Tensor& beta1 = ctx->input(6); + const Tensor& beta2 = ctx->input(7); + const Tensor& epsilon = ctx->input(8); + const Tensor& grad = ctx->input(9); + const Tensor& indices = ctx->input(10); + const Tensor& global_step = ctx->input(11); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), + errors::InvalidArgument("lr is not a scalar: ", + lr.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1.shape()), + errors::InvalidArgument("beta1 is not a scalar: ", + beta1.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2.shape()), + errors::InvalidArgument("beta2 is not a scalar: ", + beta2.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()), + errors::InvalidArgument("epsilon is not a scalar: ", + epsilon.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), + errors::InvalidArgument("indices must be one-dimensional")); + + int64 inner_dim = 1; + TensorShape var_shape({var->ValueLen()}); + for (int d = 0; d < var_shape.dims(); d++) { + OP_REQUIRES(ctx, var_shape.dim_size(d) == grad.dim_size(d + 1), + errors::InvalidArgument(strings::StrCat( + "var and grad must match in dimension ", d + 1))); + inner_dim *= grad.dim_size(d + 1); + } + OP_REQUIRES(ctx, inner_dim > 0, + errors::InvalidArgument( + "Inner dimension 
should be greater than zero.")); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(global_step.shape()), + errors::InvalidArgument("global_step is not a scalar: ", + global_step.shape().DebugString())); + + const int64 N = indices.dim_size(0); + OP_REQUIRES( + ctx, grad.dim_size(0) == N, + errors::InvalidArgument( + "grad must be the same size as indices in the first dimension.")); + + if (N > 0) { + if (var->IsSingleHbm()) { + const Device& device = ctx->eigen_device(); + OP_REQUIRES_OK( + ctx, functor::KvSparseApplyAdamAsync()( + device, var, m, v, beta1_power.scalar(), + beta2_power.scalar(), indices.vec(), + grad.flat_outer_dims(), lr.scalar(), + beta1.scalar(), beta2.scalar(), epsilon.scalar(), + global_step.scalar(), apply_sparse_rmsprop_, + inner_dim, ctx->get_allocator(AllocatorAttributes()))); + } else { + auto indices_vec = indices.vec(); + auto grad_flat = grad.flat_outer_dims(); + Tstep gs = global_step.scalar()(); + const T lr_scalar = lr.scalar()(); + const T beta1_scalar = beta1.scalar()(); + const T beta2_scalar = beta2.scalar()(); + const T epsilon_scalar = epsilon.scalar()(); + auto beta1_power_scalar = beta1_power.scalar(); + auto beta2_power_scalar = beta2_power.scalar(); + + Tensor indices_temp_host(indices.dtype(), indices.shape()); + const Tensor* indices_host_ptr = nullptr; + // Copy ids from GPU to CPU for CPU Lookup. + auto stream = ctx->op_device_context()->stream(); + auto event_mgr = + ctx->device()->tensorflow_accelerator_device_info()->event_mgr; + if (!indices_as_pointer) { + indices_host_ptr = &indices_temp_host; + se::DeviceMemoryBase gpu_src(const_cast(&indices_vec(0)), + N * sizeof(Tindex)); + stream->ThenMemcpy(indices_host_ptr->data(), gpu_src, + N * sizeof(Tindex)); + SyncWithEventMgr(stream, event_mgr); + } else { + indices_host_ptr = &indices; + } + + int counts_index = has_counts ? 
12 : -1; + T** var_ptr = new T*[N * 3]; + T** m_ptr = var_ptr + N; + T** v_ptr = m_ptr + N; + std::vector*, T**>> vars(3); + vars[0] = std::pair*, T**>(var, var_ptr); + vars[1] = std::pair*, T**>(m, m_ptr); + vars[2] = std::pair*, T**>(v, v_ptr); + GetEmbeddingPointers(ctx, vars, (Tindex*)indices_host_ptr->data(), gs, + indices_as_pointer, counts_index, N, + thread_copy_id_alloc_.get()); + + ApplyGradients(var, m, v, var_ptr, m_ptr, v_ptr, beta1_scalar, + beta2_scalar, epsilon_scalar, lr_scalar, + beta1_power_scalar, beta2_power_scalar, &grad_flat(0), N, + stream, event_mgr, ctx->eigen_device()); + + if (has_counts && !indices_as_pointer) { + const Tensor& counts_tensor = ctx->input(counts_index); + var->UpdateCache(*indices_host_ptr, counts_tensor); + } + + delete[] var_ptr; + } + } + MaybeForwardRefInputToRefOutput(ctx, 0, 0); + } + + private: + bool use_exclusive_lock_; + bool apply_sparse_rmsprop_; + std::unique_ptr thread_copy_id_alloc_; +}; + +#define REGISTER_KERNELS(D, T, Tindices, Tstep) \ + REGISTER_KERNEL_BUILDER(Name("KvResourceSparseApplyAdamAsync") \ + .Device(DEVICE_##D) \ + .HostMemory("lr") \ + .HostMemory("beta1") \ + .HostMemory("beta2") \ + .HostMemory("epsilon") \ + .HostMemory("global_step") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamAsyncGPUOp); \ + REGISTER_KERNEL_BUILDER(Name("_OPT_KvResourceSparseApplyAdamAsync") \ + .Device(DEVICE_##D) \ + .HostMemory("indices") \ + .HostMemory("lr") \ + .HostMemory("beta1") \ + .HostMemory("beta2") \ + .HostMemory("epsilon") \ + .HostMemory("global_step") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamAsyncGPUOp); \ + REGISTER_KERNEL_BUILDER(Name("KvResourceSparseApplyAdamAsyncWithCounts") \ + .Device(DEVICE_##D) \ + .HostMemory("lr") \ + .HostMemory("beta1") \ + .HostMemory("beta2") \ + .HostMemory("epsilon") \ + .HostMemory("global_step") \ + .HostMemory("indices_counts") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamAsyncGPUOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyAdamAsyncWithCounts") \ + .Device(DEVICE_##D) \ + .HostMemory("indices") \ + .HostMemory("lr") \ + .HostMemory("beta1") \ + .HostMemory("beta2") \ + .HostMemory("epsilon") \ + .HostMemory("global_step") \ + .HostMemory("indices_counts") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamAsyncGPUOp); +#define REGISTER_GPU_KERNELS(T) \ + REGISTER_KERNELS(GPU, T, int32, int32); \ + REGISTER_KERNELS(GPU, T, int64, int32); \ + REGISTER_KERNELS(GPU, T, int32, int64); \ + REGISTER_KERNELS(GPU, T, int64, int64); + +TF_CALL_float(REGISTER_GPU_KERNELS); +TF_CALL_double(REGISTER_GPU_KERNELS); + +#undef REGISTER_GPU_KERNELS +#undef REGISTER_KERNELS +#endif // GOOGLE_CUDA + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +// Forward declarations of the functor specializations for GPU. 
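+// These specializations are presumably defined in the CUDA sources pulled in
+// via training_ali_ops_gpu.h; the `extern template` lines keep this
+// translation unit from instantiating them while still letting the GPU
+// kernels above link against them.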
+namespace functor { +#define DECLARE_GPU_SPEC(T, Tindex, Tstep) \ + template <> \ + Status KvSparseApplyAdamAsync::operator()( \ + const GPUDevice& d, EmbeddingVar* var, \ + EmbeddingVar* m, EmbeddingVar* v, \ + typename TTypes::Scalar beta1_power_scalar, \ + typename TTypes::Scalar beta2_power_scalar, \ + typename TTypes::ConstVec indices_vec, \ + typename TTypes::ConstMatrix grad, \ + typename TTypes::ConstScalar lr_scalar, \ + typename TTypes::ConstScalar beta1_scalar, \ + typename TTypes::ConstScalar beta2_scalar, \ + typename TTypes::ConstScalar epsilon_scalar, \ + typename TTypes::ConstScalar global_step_scalar, \ + bool apply_sparse_rmsprop, const int64 inner_dim, Allocator* alloc); \ + extern template struct KvSparseApplyAdamAsync; + +#define DECLARE_GPU_SPEC_TYPE(T) \ + DECLARE_GPU_SPEC(T, int32, int32); \ + DECLARE_GPU_SPEC(T, int32, int64); \ + DECLARE_GPU_SPEC(T, int64, int32); \ + DECLARE_GPU_SPEC(T, int64, int64); + +DECLARE_GPU_SPEC_TYPE(float); +DECLARE_GPU_SPEC_TYPE(double); + +#undef DECLARE_GPU_SPEC_TYPE +#undef DECLARE_GPU_SPEC +} // end of namespace functor + +#endif // End of GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/training_adam_ops.cc b/deepray/custom_ops/embedding_variable/cc/kernels/training_adam_ops.cc new file mode 100644 index 00000000..7dd80c73 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/training_adam_ops.cc @@ -0,0 +1,529 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#define EIGEN_USE_THREADS +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA +#include + +#include "deepray/custom_ops/embedding_variable/cc/embedding/intra_thread_copy_id_allocator.h" +#include "deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/training_op_helpers.h" +#include "tensorflow/core/kernels/variable_ops.h" +#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#include "tensorflow/core/util/work_sharder.h" +#include "training_ali_op_helpers.h" + +#ifdef TENSORFLOW_USE_SYCL +#include "tensorflow/core/common_runtime/sycl/sycl_util.h" +#endif // TENSORFLOW_USE_SYCL + +#if GOOGLE_CUDA +#include "tensorflow/core/kernels/gpu_device_array.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "training_ali_ops_gpu.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { + +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; +using SYCLDevice = Eigen::SyclDevice; + +template +class KvSparseApplyAdamOp : public OpKernel { + public: + explicit KvSparseApplyAdamOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + } + + void Compute(OpKernelContext* ctx) override TF_NO_THREAD_SAFETY_ANALYSIS { + auto locks = MaybeLockEmbeddingVariableInputMutexesInOrder( + ctx, use_exclusive_lock_, {0, 1, 2}); + EmbeddingVar* var = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 0, &var)); + core::ScopedUnref unref_var(var); + + EmbeddingVar* m = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 1, &m)); + core::ScopedUnref unref_m(m); + + EmbeddingVar* v = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 2, &v)); + core::ScopedUnref unref_v(v); + + const Tensor& beta1_power = ctx->input(3); + const Tensor& beta2_power = ctx->input(4); + const Tensor& lr = ctx->input(5); + const Tensor& beta1 = ctx->input(6); + const Tensor& beta2 = ctx->input(7); + const Tensor& epsilon = ctx->input(8); + const Tensor& grad = ctx->input(9); + const Tensor& indices = ctx->input(10); + const Tensor& global_step = ctx->input(11); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_power.shape()), + errors::InvalidArgument("beta1_power is not a scalar: ", + beta1_power.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2_power.shape()), + errors::InvalidArgument("beta2_power is not a scalar: ", + beta2_power.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), + errors::InvalidArgument("lr is not a scalar: ", + lr.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1.shape()), + errors::InvalidArgument("beta1 is not a scalar: ", + beta1.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2.shape()), + errors::InvalidArgument("beta2 is not a scalar: ", + beta2.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()), + errors::InvalidArgument("epsilon is not a scalar: ", + epsilon.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), + errors::InvalidArgument("indices must be one-dimensional")); + + int64 inner_dim = 1; + TensorShape var_shape({var->ValueLen()}); + for (int d = 0; d < var_shape.dims(); d++) { + OP_REQUIRES(ctx, 
var_shape.dim_size(d) == grad.dim_size(d + 1), + errors::InvalidArgument(strings::StrCat( + "var and grad must match in dimension ", d + 1))); + inner_dim *= grad.dim_size(d + 1); + } + OP_REQUIRES(ctx, inner_dim > 0, + errors::InvalidArgument( + "Inner dimension should be greater than zero.")); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(global_step.shape()), + errors::InvalidArgument("global_step is not a scalar: ", + global_step.shape().DebugString())); + + const int64 N = indices.dim_size(0); + OP_REQUIRES( + ctx, grad.dim_size(0) == N, + errors::InvalidArgument( + "grad must be the same size as indices in the first dimension.")); + int64* indices_counts = nullptr; + std::function get_count_fn = 0; + if (has_counts) { + const Tensor& counts_tensor = ctx->input(12); + indices_counts = (int64*)counts_tensor.data(); + get_count_fn = [](int64* counts, int64 index) { return counts[index]; }; + } else { + get_count_fn = [](int64* counts, int64 index) { return 1; }; + } + if (N > 0) { + T beta1_power_scalar = beta1_power.scalar()(); + T beta2_power_scalar = beta2_power.scalar()(); + T lr_scalar = lr.scalar()(); + T beta1_scalar = beta1.scalar()(); + T beta2_scalar = beta2.scalar()(); + T epsilon_scalar = epsilon.scalar()(); + const T alpha = + lr_scalar * + Eigen::numext::sqrt(static_cast(1) - beta2_power_scalar) / + (static_cast(1) - beta1_power_scalar); + + auto do_work = [this, ctx, inner_dim, &var, &m, &v, &grad, &indices, + &lr_scalar, &beta1_scalar, &beta1_power, &beta2_power, + &beta2_scalar, &epsilon_scalar, &alpha, &global_step, + get_count_fn, + indices_counts](int64 start_i, int64 limit_i) { + if (inner_dim > 0) { + auto grad_flat = grad.flat_outer_dims(); + auto indices_vec = indices.vec(); + Tstep gs = global_step.scalar()(); + + for (int64 i = start_i; i < limit_i; i++) { + const Tindex index = indices_vec(i); + void* value_ptr = nullptr; + bool is_filter = false; + int64 count = get_count_fn(indices_counts, i); + OP_REQUIRES_OK(ctx, + var->LookupOrCreateKey(index, &value_ptr, &is_filter, + indices_as_pointer, count)); + var->UpdateVersion(value_ptr, gs); + if (is_filter) { + auto m_a = m->flat(value_ptr); + auto v_a = v->flat(value_ptr); + auto g = grad_flat.template chip<0>(i); + auto var_i = var->flat(value_ptr); + + m_a = m_a * beta1_scalar + g * (static_cast(1) - beta1_scalar); + v_a = v_a * beta2_scalar + + g.square() * (static_cast(1) - beta2_scalar); + var_i -= (m_a * alpha) / (v_a.sqrt() + epsilon_scalar); + } + } + } + }; + + const int64 cost = 1000; + auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); + Shard(worker_threads.num_threads, worker_threads.workers, N, cost, + do_work); + if (has_counts && !indices_as_pointer) { + const Tensor& indices_counts = ctx->input(12); + var->UpdateCache(indices, indices_counts); + } + } + + MaybeForwardRefInputToRefOutput(ctx, 0, 0); + } + + private: + bool use_exclusive_lock_; +}; + +#define REGISTER_KERNELS(D, T, Tindices, Tstep) \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyAdam") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyAdam") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyAdamWithCounts") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + 
.TypeConstraint("Tstep"), \ + KvSparseApplyAdamOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyAdamWithCounts") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamOp); + +#define REGISTER_CPU_KERNELS(T) \ + REGISTER_KERNELS(CPU, T, int32, int32); \ + REGISTER_KERNELS(CPU, T, int64, int32); \ + REGISTER_KERNELS(CPU, T, int32, int64); \ + REGISTER_KERNELS(CPU, T, int64, int64); + +TF_CALL_half(REGISTER_CPU_KERNELS); +TF_CALL_bfloat16(REGISTER_CPU_KERNELS); +TF_CALL_float(REGISTER_CPU_KERNELS); +TF_CALL_double(REGISTER_CPU_KERNELS); + +#undef REGISTER_CPU_KERNELS +#undef REGISTER_KERNELS + +#if GOOGLE_CUDA +template +class KvSparseApplyAdamGPUOp : public OpKernel { + public: + explicit KvSparseApplyAdamGPUOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + + int num_worker_threads = + ctx->device()->tensorflow_cpu_worker_threads()->num_threads; + thread_copy_id_alloc_.reset( + new IntraThreadCopyIdAllocator(num_worker_threads)); + } + + void ApplyGradients(EmbeddingVar* var, EmbeddingVar* m, + EmbeddingVar* v, T** var_ptr, T** m_ptr, + T** v_ptr, T beta1, T beta2, T epsilon, T lr, + T beta1_power, T beta2_power, const T* grad_base, + const int64 task_size, se::Stream* stream, + EventMgr* event_mgr, const Eigen::GpuDevice& gpu_device) { + // Send pointers of embeddings to GPU + T** dev_var_ptr = (T**)var->GetBuffer(task_size * 3); + T** dev_m_ptr = dev_var_ptr + task_size; + T** dev_v_ptr = dev_m_ptr + task_size; + CHECK(dev_var_ptr); + CHECK(dev_m_ptr); + CHECK(dev_v_ptr); + + se::DeviceMemoryBase dst_ptr(dev_var_ptr, sizeof(T*) * task_size * 3); + stream->ThenMemcpy(&dst_ptr, var_ptr, sizeof(T*) * task_size * 3); + + int block_size = 128; + int embedding_dim = var->ValueLen(); + functor::KvSparseApplyAdamHbm()( + block_size, embedding_dim, dev_var_ptr, dev_m_ptr, dev_v_ptr, grad_base, + lr, beta1, beta2, epsilon, beta1_power, beta2_power, task_size, + gpu_device); + SyncWithEventMgr(stream, event_mgr); + } + + void Compute(OpKernelContext* ctx) override TF_NO_THREAD_SAFETY_ANALYSIS { + auto locks = MaybeLockEmbeddingVariableInputMutexesInOrder( + ctx, use_exclusive_lock_, {0, 1, 2}); + EmbeddingVar* var = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 0, &var)); + core::ScopedUnref unref_var(var); + + EmbeddingVar* m = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 1, &m)); + core::ScopedUnref unref_m(m); + + EmbeddingVar* v = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 2, &v)); + core::ScopedUnref unref_v(v); + + const Tensor& beta1_power = ctx->input(3); + const Tensor& beta2_power = ctx->input(4); + const Tensor& lr = ctx->input(5); + const Tensor& beta1 = ctx->input(6); + const Tensor& beta2 = ctx->input(7); + const Tensor& epsilon = ctx->input(8); + const Tensor& grad = ctx->input(9); + const Tensor& indices = ctx->input(10); + const Tensor& global_step = ctx->input(11); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_power.shape()), + errors::InvalidArgument("beta1_power is not a scalar: ", + beta1_power.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2_power.shape()), + errors::InvalidArgument("beta2_power is not a scalar: ", + beta2_power.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), + errors::InvalidArgument("lr is not a scalar: ", + lr.shape().DebugString())); + OP_REQUIRES(ctx, 
TensorShapeUtils::IsScalar(beta1.shape()), + errors::InvalidArgument("beta1 is not a scalar: ", + beta1.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2.shape()), + errors::InvalidArgument("beta2 is not a scalar: ", + beta2.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()), + errors::InvalidArgument("epsilon is not a scalar: ", + epsilon.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), + errors::InvalidArgument("indices must be one-dimensional")); + + int64 inner_dim = 1; + TensorShape var_shape({var->ValueLen()}); + for (int d = 0; d < var_shape.dims(); d++) { + OP_REQUIRES(ctx, var_shape.dim_size(d) == grad.dim_size(d + 1), + errors::InvalidArgument(strings::StrCat( + "var and grad must match in dimension ", d + 1))); + inner_dim *= grad.dim_size(d + 1); + } + OP_REQUIRES(ctx, inner_dim > 0, + errors::InvalidArgument( + "Inner dimension should be greater than zero.")); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(global_step.shape()), + errors::InvalidArgument("global_step is not a scalar: ", + global_step.shape().DebugString())); + + const int64 N = indices.dim_size(0); + OP_REQUIRES( + ctx, grad.dim_size(0) == N, + errors::InvalidArgument( + "grad must be the same size as indices in the first dimension.")); + + if (N > 0) { + if (var->IsSingleHbm()) { + const Device& device = ctx->eigen_device(); + OP_REQUIRES_OK( + ctx, functor::KvSparseApplyAdam()( + device, var, m, v, beta1_power.scalar(), + beta2_power.scalar(), indices.vec(), + grad.flat_outer_dims(), lr.scalar(), + beta1.scalar(), beta2.scalar(), epsilon.scalar(), + global_step.scalar(), inner_dim, + ctx->get_allocator(AllocatorAttributes()))); + } else { + auto indices_vec = indices.vec(); + auto grad_flat = grad.flat_outer_dims(); + Tstep gs = global_step.scalar()(); + const T lr_scalar = lr.scalar()(); + const T beta1_scalar = beta1.scalar()(); + const T beta2_scalar = beta2.scalar()(); + const T epsilon_scalar = epsilon.scalar()(); + const T beta1_power_scalar = beta1_power.scalar()(); + const T beta2_power_scalar = beta2_power.scalar()(); + + Tensor indices_temp_host(indices.dtype(), indices.shape()); + const Tensor* indices_host_ptr = nullptr; + // Copy ids from GPU to CPU for CPU Lookup. + auto stream = ctx->op_device_context()->stream(); + auto event_mgr = + ctx->device()->tensorflow_accelerator_device_info()->event_mgr; + if (!indices_as_pointer) { + indices_host_ptr = &indices_temp_host; + se::DeviceMemoryBase gpu_src(const_cast(&indices_vec(0)), + N * sizeof(Tindex)); + stream->ThenMemcpy(indices_host_ptr->data(), gpu_src, + N * sizeof(Tindex)); + SyncWithEventMgr(stream, event_mgr); + } else { + indices_host_ptr = &indices; + } + + int counts_index = has_counts ? 
12 : -1; + T** var_ptr = new T*[N * 3]; + T** m_ptr = var_ptr + N; + T** v_ptr = m_ptr + N; + std::vector*, T**>> vars(3); + vars[0] = std::pair*, T**>(var, var_ptr); + vars[1] = std::pair*, T**>(m, m_ptr); + vars[2] = std::pair*, T**>(v, v_ptr); + GetEmbeddingPointers(ctx, vars, (Tindex*)indices_host_ptr->data(), gs, + indices_as_pointer, counts_index, N, + thread_copy_id_alloc_.get()); + + ApplyGradients(var, m, v, var_ptr, m_ptr, v_ptr, beta1_scalar, + beta2_scalar, epsilon_scalar, lr_scalar, + beta1_power_scalar, beta2_power_scalar, &grad_flat(0), N, + stream, event_mgr, ctx->eigen_device()); + + if (has_counts && !indices_as_pointer) { + const Tensor& counts_tensor = ctx->input(counts_index); + var->UpdateCache(*indices_host_ptr, counts_tensor); + } + + delete[] var_ptr; + } + } + MaybeForwardRefInputToRefOutput(ctx, 0, 0); + } + + private: + bool use_exclusive_lock_; + std::unique_ptr thread_copy_id_alloc_; +}; + +#define REGISTER_KERNELS(D, T, Tindices, Tstep) \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyAdam") \ + .Device(DEVICE_##D) \ + .HostMemory("lr") \ + .HostMemory("beta1_power") \ + .HostMemory("beta2_power") \ + .HostMemory("beta1") \ + .HostMemory("beta2") \ + .HostMemory("epsilon") \ + .HostMemory("global_step") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamGPUOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyAdam") \ + .Device(DEVICE_##D) \ + .HostMemory("indices") \ + .HostMemory("lr") \ + .HostMemory("beta1_power") \ + .HostMemory("beta2_power") \ + .HostMemory("beta1") \ + .HostMemory("beta2") \ + .HostMemory("epsilon") \ + .HostMemory("global_step") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamGPUOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyAdamWithCounts") \ + .Device(DEVICE_##D) \ + .HostMemory("lr") \ + .HostMemory("beta1_power") \ + .HostMemory("beta2_power") \ + .HostMemory("beta1") \ + .HostMemory("beta2") \ + .HostMemory("epsilon") \ + .HostMemory("global_step") \ + .HostMemory("indices_counts") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamGPUOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyAdamWithCounts") \ + .Device(DEVICE_##D) \ + .HostMemory("indices") \ + .HostMemory("lr") \ + .HostMemory("beta1_power") \ + .HostMemory("beta2_power") \ + .HostMemory("beta1") \ + .HostMemory("beta2") \ + .HostMemory("epsilon") \ + .HostMemory("global_step") \ + .HostMemory("indices_counts") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvSparseApplyAdamGPUOp); +#define REGISTER_GPU_KERNELS(T) \ + REGISTER_KERNELS(GPU, T, int32, int32); \ + REGISTER_KERNELS(GPU, T, int64, int32); \ + REGISTER_KERNELS(GPU, T, int32, int64); \ + REGISTER_KERNELS(GPU, T, int64, int64); + +TF_CALL_float(REGISTER_GPU_KERNELS); +TF_CALL_double(REGISTER_GPU_KERNELS); + +#undef REGISTER_GPU_KERNELS +#undef REGISTER_KERNELS +#endif // GOOGLE_CUDA + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +// Forward declarations of the functor specializations for GPU. 
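+// These DECLARE_GPU_SPEC entries only declare the functor specializations;
+// the matching explicit instantiations are provided in
+// training_ali_ops_gpu.cu.cc (added later in this patch), which is built as
+// CUDA code. Keeping bare declarations here lets this translation unit
+// register the GPU kernels without including any device-side headers.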
+namespace functor { +#define DECLARE_GPU_SPEC(T, Tindex, Tstep) \ + template <> \ + Status KvSparseApplyAdam::operator()( \ + const GPUDevice& d, EmbeddingVar* var, \ + EmbeddingVar* m, EmbeddingVar* v, \ + typename TTypes::ConstScalar beta1_power_scalar, \ + typename TTypes::ConstScalar beta2_power_scalar, \ + typename TTypes::ConstVec indices_vec, \ + typename TTypes::ConstMatrix grad, \ + typename TTypes::ConstScalar lr_scalar, \ + typename TTypes::ConstScalar beta1_scalar, \ + typename TTypes::ConstScalar beta2_scalar, \ + typename TTypes::ConstScalar epsilon_scalar, \ + typename TTypes::ConstScalar global_step_scalar, \ + const int64 inner_dim, Allocator* alloc); \ + extern template struct KvSparseApplyAdam; + +#define DECLARE_GPU_SPEC_TYPE(T) \ + DECLARE_GPU_SPEC(T, int32, int32); \ + DECLARE_GPU_SPEC(T, int32, int64); \ + DECLARE_GPU_SPEC(T, int64, int32); \ + DECLARE_GPU_SPEC(T, int64, int64); + +DECLARE_GPU_SPEC_TYPE(float); +DECLARE_GPU_SPEC_TYPE(double); + +#undef DECLARE_GPU_SPEC_TYPE +#undef DECLARE_GPU_SPEC +} // end of namespace functor + +#endif // End of GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/training_ali_op_helpers.h b/deepray/custom_ops/embedding_variable/cc/kernels/training_ali_op_helpers.h new file mode 100644 index 00000000..3136c30b --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/training_ali_op_helpers.h @@ -0,0 +1,182 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_TRAINING_ALI_OP_HELPERS_H_ +#define TENSORFLOW_CORE_KERNELS_TRAINING_ALI_OP_HELPERS_H_ + +#include "deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/variant_op_registry.h" +#include "tensorflow/core/kernels/dense_update_functor.h" +#include "tensorflow/core/kernels/variable_ops.h" + +namespace tensorflow { + +// ********************************************************************** +// TODO: candy.dc +// this code is duplicated from training_op_helpers.h +// Once this function and Class support template, this duplicated code +// should be removed +// ********************************************************************** + +// Returns a borrowed pointer to the mutex for the variable `input` in `ctx`. +// +// If `input` corresponds to a `DT_RESOURCE`-type variable input, +// `*maybe_resource` will be updated to contain the underlying resource, and the +// caller will be responsible for calling `Unref()` on that resource. 
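+//
+// Hedged usage sketch (the template-argument lists were elided in this patch
+// text; the <int64, float> arguments below are an assumption that mirrors the
+// EmbeddingVar's key/value types, not a confirmed signature):
+//
+//   EmbeddingVar<int64, float>* var = nullptr;
+//   mutex* mu = GetTrainingEmbeddingVariableMutex<int64, float>(ctx, 0, &var);
+//   core::ScopedUnref unref(var);      // caller owns the reference when set
+//   if (mu != nullptr) {
+//     mutex_lock l(*mu);               // guard the update on the variable
+//     /* ... apply gradient to var ... */
+//   }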
+template +mutex* GetTrainingEmbeddingVariableMutex(OpKernelContext* ctx, int input, + EmbeddingVar** maybe_resource) { + *maybe_resource = nullptr; + if (ctx->input_dtype(input) == DT_RESOURCE) { + if (LookupResource(ctx, HandleFromInput(ctx, input), maybe_resource).ok()) { + return (*maybe_resource)->mu(); + } else { + ctx->CtxFailureWithWarning( + errors::Internal("Invalid variable reference.")); + return nullptr; + } + } + return ctx->input_ref_mutex(input); +} + +// Utility structure that releases a sequence of borrowed mutexes when it is +// deleted. +template +struct EmbeddingVariableInputLockHolder { + public: + EmbeddingVariableInputLockHolder( + std::vector*> vars, + std::unique_ptr> locks) + : vars_(std::move(vars)), locks_(std::move(locks)) {} + + EmbeddingVariableInputLockHolder(EmbeddingVariableInputLockHolder&& other) + : vars_(std::move(other.vars_)), locks_(std::move(other.locks_)) {} + + ~EmbeddingVariableInputLockHolder() { + // Release the locks before unreffing the Vars, because each lock + // is potentially borrowed from a Var in vars_. + locks_.reset(); + for (EmbeddingVar* var : vars_) { + var->Unref(); + } + } + + private: + std::vector*> vars_; + // NOTE: Use a `std::unique_ptr` instead of moving in a vector directly, + // because a `std::vector` is not movable on all platforms. + std::unique_ptr> locks_; +}; + +template +EmbeddingVariableInputLockHolder +MaybeLockEmbeddingVariableInputMutexesInOrder( + OpKernelContext* ctx, bool do_lock, const std::vector& input_ids) { + if (!do_lock) { + return EmbeddingVariableInputLockHolder({}, {}); + } + std::vector*> vars; + std::vector mutexes; + std::vector acquire_order; + for (auto input : input_ids) { + EmbeddingVar* var; + mutex* mutex = GetTrainingEmbeddingVariableMutex(ctx, input, &var); + if (var) vars.push_back(var); + // Only lock each mutex once if duplicates exist (n^2 but n is 2 or 3). + if (std::find(mutexes.begin(), mutexes.end(), mutex) == mutexes.end()) { + acquire_order.push_back(mutexes.size()); + mutexes.push_back(mutex); + } + } + std::sort(acquire_order.begin(), acquire_order.end(), + [&mutexes](int a, int b) { return mutexes[a] < mutexes[b]; }); + + std::unique_ptr> locks = + std::make_unique>(); + locks->reserve(acquire_order.size()); + + for (auto input : acquire_order) { + EmbeddingVar* var; + mutex* mu = GetTrainingEmbeddingVariableMutex(ctx, input, &var); + core::ScopedUnref scoped_unref(var); + if (mu != nullptr) { + locks->emplace_back(*mu); + } + } + return EmbeddingVariableInputLockHolder(std::move(vars), + std::move(locks)); +} + +template +void LookupKeyAndSetVersion(OpKernelContext* ctx, EmbeddingVar* var, + void** value_ptrs, Tstep gs, const K* indices, + int64 task_size, bool indices_as_pointer, + int counts_index) { + EmbeddingVarContext ev_ctx(ctx); + int64* indices_counts = nullptr; + std::function get_count_fn = 0; + if (counts_index != -1) { + const Tensor& counts_tensor = ctx->input(counts_index); + indices_counts = (int64*)counts_tensor.data(); + } + var->LookupOrCreateKey(ev_ctx, indices, value_ptrs, task_size, indices_counts, + indices_as_pointer); + + auto update_version_fn = [var, value_ptrs, gs](int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + var->UpdateVersion(value_ptrs[i], gs); + } + }; + const int64 unit_cost = 1000; // very unreliable estimate for cost per step. 
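+  // Shard the per-key version update across the intra-op CPU thread pool.
+  // unit_cost is only a rough hint to the sharder (as noted above), so the
+  // work split is best-effort rather than an exact cost model.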
+ auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); + Shard(worker_threads->num_threads, worker_threads->workers, task_size, + unit_cost, update_version_fn); +} + +template +void LookupEmbedding(OpKernelContext* ctx, + std::vector*, V**>>& vars, + void** value_ptrs, const K* indices, int64 num_of_keys) { + for (auto it : vars) { + EmbeddingVar* var = it.first; + V** var_ptr = it.second; + auto lookup_emb_fn = [var, var_ptr, value_ptrs](int64 start, int64 limit) { + for (int i = start; i < limit; i++) { + var_ptr[i] = var->GetValuePtr(value_ptrs[i]); + } + }; + const int64 unit_cost = 1000; // very unreliable estimate for cost per + // step. + auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads(); + Shard(worker_threads->num_threads, worker_threads->workers, num_of_keys, + unit_cost, lookup_emb_fn); + } +} + +template +void GetEmbeddingPointers( + OpKernelContext* ctx, + std::vector*, V**>>& vars, const K* indices, + Tstep gs, bool indices_as_pointer, int counts_index, int64 num_of_keys, + IntraThreadCopyIdAllocator* thread_copy_id_alloc) { + std::vector value_ptrs(num_of_keys); + LookupKeyAndSetVersion(ctx, vars[0].first, value_ptrs.data(), gs, indices, + num_of_keys, indices_as_pointer, counts_index); + LookupEmbedding(ctx, vars, value_ptrs.data(), indices, num_of_keys); +} +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_TRAINING_ALI_OP_HELPERS_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/training_ali_ops_gpu.cu.cc b/deepray/custom_ops/embedding_variable/cc/kernels/training_ali_ops_gpu.cu.cc new file mode 100644 index 00000000..41eb2631 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/training_ali_ops_gpu.cu.cc @@ -0,0 +1,650 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU + +#include "training_ali_ops_gpu.h" + +#include "deepray/custom_ops/embedding_variable/cc/embedding/gpu_hash_table.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { +template +__device__ T impl_sqrt(T x) { + return sqrt(x); +} +template +__device__ T impl_rsqrt(T x) { + return rsqrt(x); +} +template <> +__device__ Eigen::half impl_sqrt(Eigen::half x) { + return __float2half(sqrt(__half2float(x))); +} +template <> +__device__ Eigen::half impl_rsqrt(Eigen::half x) { + return __float2half(rsqrt(__half2float(x))); +} + +template +__global__ void kv_sparse_apply_adagrad_kernel( + const Tindex* key_base, int32* item_idxs, int64 dim, Value** d_banks, + bool** d_flags, int32 var_slot_idx, int32 acc_slot_idx, int32 slot_num, + int32 bank_size, Value lr, const Value* grad, Value* var_default_v, + Value* acc_default_v, int32 var_default_v_num, int32 acc_default_v_num) { + auto item_idx = blockIdx.x; + auto item_pos = item_idxs[item_idx]; + auto bank_idx = item_pos / bank_size; + auto offset_in_bank = item_pos % bank_size; + auto var_slot_offset = bank_idx * slot_num + var_slot_idx; + auto acc_slot_offset = bank_idx * slot_num + acc_slot_idx; + bool var_stored = d_flags[var_slot_offset][offset_in_bank]; + bool acc_stored = d_flags[acc_slot_offset][offset_in_bank]; + __syncthreads(); + + if (var_default_v != nullptr && var_stored == false) { + d_flags[var_slot_offset][offset_in_bank] = true; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + d_banks[var_slot_offset][offset_in_bank * dim + id] = + var_default_v[(*(key_base + item_idx) % var_default_v_num) * dim + + id]; + } + } + if (acc_default_v != nullptr && acc_stored == false) { + d_flags[acc_slot_offset][offset_in_bank] = true; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + d_banks[acc_slot_offset][offset_in_bank * dim + id] = + acc_default_v[(*(key_base + item_idx) % acc_default_v_num) * dim + + id]; + } + } + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + auto tmp_offset = offset_in_bank * dim + id; + Value g = grad[item_idx * dim + id]; + Value* acc = &d_banks[acc_slot_offset][tmp_offset]; + (*acc) += g * g; + d_banks[var_slot_offset][tmp_offset] -= lr * g * rsqrtf(*acc); + } +} + +template +struct KvSparseApplyAdagrad { + void operator()(int32 num_items, Allocator* alloc, + EmbeddingVar* var, EmbeddingVar* accum, + const Tindex* key_base, const T* grad, T lr, int64 gs, + const GPUDevice& device) { + int32* item_idxs = TypedAllocator::Allocate(alloc, num_items, + AllocationAttributes()); + var->LookupOrCreateKey(key_base, item_idxs, num_items, device, gs); + auto const block_size = 256; + auto const grid_size = num_items; + GPUHashTable* hashtable = var->HashTable(); + TF_CHECK_OK(GpuLaunchKernel( + kv_sparse_apply_adagrad_kernel, grid_size, block_size, 0, + device.stream(), key_base, item_idxs, var->ValueLen(), + hashtable->d_bank_ptrs, hashtable->d_existence_flag_ptrs, var->EmbIdx(), + accum->EmbIdx(), var->SlotNum(), hashtable->initial_bank_size, lr, grad, + var->GetDefaultValuePtr(), accum->GetDefaultValuePtr(), + var->GetDefaultValueDim(), accum->GetDefaultValueDim())); + TypedAllocator::Deallocate(alloc, item_idxs, num_items); + } +}; + +template +struct KvSparseApplyAdagradHbm { + void operator()(int block_size, int embedding_dim, 
T** dev_a, T** dev_v, + const T* grad_base, T lr_scalar, int64 task_size, + const GPUDevice& device) { + TF_CHECK_OK(GpuLaunchKernel( + SparseApplyAdagradGPU, + (task_size + block_size - 1) / block_size * embedding_dim, block_size, + 0, device.stream(), dev_a, dev_v, grad_base, lr_scalar, embedding_dim, + task_size)); + } +}; + +template +__global__ void KvSparseApplyAdamKernel( + const TKey* key_base, int32* item_idxs, int64 dim, T** d_banks, + bool** d_flags, int32 var_slot_idx, int32 v_slot_idx, int32 m_slot_idx, + int32 slot_num, int32 bank_size, const T* beta1_scalar, + const T* beta2_scalar, const T* beta1_power_scalar, + const T* beta2_power_scalar, const T* epsilon_scalar, const T* lr_scalar, + const T* grad, T* var_default_v, T* v_default_v, T* m_default_v, + int32 var_default_v_num, int32 v_default_v_num, int32 m_default_v_num) { + const T lr = *lr_scalar; + const T beta1 = *beta1_scalar; + const T beta2 = *beta2_scalar; + const T beta1_power = *beta1_power_scalar; + const T beta2_power = *beta2_power_scalar; + const T epsilon = *epsilon_scalar; + + auto item_idx = blockIdx.x; + auto item_pos = item_idxs[item_idx]; + auto bank_idx = item_pos / bank_size; + auto offset_in_bank = item_pos % bank_size; + auto var_slot_offset = bank_idx * slot_num + var_slot_idx; + auto v_slot_offset = bank_idx * slot_num + v_slot_idx; + auto m_slot_offset = bank_idx * slot_num + m_slot_idx; + bool var_stored = d_flags[var_slot_offset][offset_in_bank]; + bool v_stored = d_flags[v_slot_offset][offset_in_bank]; + bool m_stored = d_flags[m_slot_offset][offset_in_bank]; + const T alpha = lr * sqrt(static_cast(1) - beta2_power) / + (static_cast(1) - beta1_power); + __syncthreads(); + + if (var_default_v != nullptr && var_stored == false) { + d_flags[var_slot_offset][offset_in_bank] = true; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + d_banks[var_slot_offset][offset_in_bank * dim + id] = + var_default_v[(*(key_base + item_idx) % var_default_v_num) * dim + + id]; + } + } + if (v_default_v != nullptr && v_stored == false) { + d_flags[v_slot_offset][offset_in_bank] = true; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + d_banks[v_slot_offset][offset_in_bank * dim + id] = + v_default_v[(*(key_base + item_idx) % v_default_v_num) * dim + id]; + } + } + if (m_default_v != nullptr && m_stored == false) { + d_flags[m_slot_offset][offset_in_bank] = true; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + d_banks[m_slot_offset][offset_in_bank * dim + id] = + m_default_v[(*(key_base + item_idx) % m_default_v_num) * dim + id]; + } + } + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + auto tmp_offset = offset_in_bank * dim + id; + T grad_a = grad[item_idx * dim + id]; + T& var_a = d_banks[var_slot_offset][tmp_offset]; + T& v_a = d_banks[v_slot_offset][tmp_offset]; + T& m_a = d_banks[m_slot_offset][tmp_offset]; + + m_a = m_a * beta1 + grad_a * (static_cast(1) - beta1); + v_a = v_a * beta2 + grad_a * grad_a * (static_cast(1) - beta2); + var_a -= (m_a * alpha) / (sqrt(v_a) + epsilon); + } +} + +template +struct KvSparseApplyAdam { + Status operator()(const GPUDevice& d, EmbeddingVar* var, + EmbeddingVar* m, EmbeddingVar* v, + typename TTypes::ConstScalar beta1_power_scalar, + typename TTypes::ConstScalar beta2_power_scalar, + typename TTypes::ConstVec indices_vec, + typename TTypes::ConstMatrix grad, + typename TTypes::ConstScalar lr_scalar, + typename TTypes::ConstScalar beta1_scalar, + typename TTypes::ConstScalar beta2_scalar, + typename TTypes::ConstScalar 
epsilon_scalar, + typename TTypes::ConstScalar global_step_scalar, + const int64 inner_dim, Allocator* alloc) { + const int32 N = indices_vec.dimension(0); + if (N <= 0) return OkStatus(); + + if (inner_dim > 0) { + const int64 global_step = global_step_scalar(); + int32* item_idxs = + TypedAllocator::Allocate(alloc, N, AllocationAttributes()); + var->LookupOrCreateKey(indices_vec.data(), item_idxs, N, d, global_step); + auto const block_size = 256; + auto const grid_size = N; + auto hashtable = var->HashTable(); + TF_CHECK_OK(GpuLaunchKernel( + KvSparseApplyAdamKernel, grid_size, block_size, 0, + d.stream(), indices_vec.data(), item_idxs, var->ValueLen(), + hashtable->d_bank_ptrs, hashtable->d_existence_flag_ptrs, + var->EmbIdx(), v->EmbIdx(), m->EmbIdx(), var->SlotNum(), + hashtable->initial_bank_size, beta1_scalar.data(), + beta2_scalar.data(), beta1_power_scalar.data(), + beta2_power_scalar.data(), epsilon_scalar.data(), lr_scalar.data(), + grad.data(), var->GetDefaultValuePtr(), v->GetDefaultValuePtr(), + m->GetDefaultValuePtr(), var->GetDefaultValueDim(), + v->GetDefaultValueDim(), m->GetDefaultValueDim())); + TypedAllocator::Deallocate(alloc, item_idxs, N); + } + + return OkStatus(); + } +}; + +#define FINAL_MASK 0xffffffff + +template +__inline__ __device__ T warpReduceSum(T val) { + for (int mask = 16; mask > 0; mask >>= 1) + val += __shfl_xor_sync(FINAL_MASK, val, mask, 32); + return val; +} + +template +__inline__ __device__ T blockReduceSum(T val) { + static __shared__ T shared[32]; + int lane = threadIdx.x & 0x1f; + int wid = threadIdx.x >> 5; + + val = warpReduceSum(val); + + if (lane == 0) shared[wid] = val; + __syncthreads(); + + val = (threadIdx.x < (blockDim.x >> 5)) ? shared[lane] : (T)0.0f; + val = warpReduceSum(val); + return val; +} + +template +__global__ void kv_sparse_apply_ftrl_kernel( + const TKey* key_base, int32* item_idxs, int64 dim, Value** d_banks, + bool** d_flags, int32 var_slot_idx, int32 acc_slot_idx, + int32 linear_slot_idx, int32 slot_num, int32 bank_size, Value lr_scalar, + const Value* grad, Value* var_default_v, Value* acc_default_v, + Value* linear_default_v, int32 var_default_v_num, int32 acc_default_v_num, + int32 linear_default_v_num, Value l1_scalar, Value l2_scalar, + Value lr_power_scalar, bool has_l2_shrinkage, Value l2_shrinkage_scalar) { + auto item_idx = blockIdx.x; + auto item_pos = item_idxs[item_idx]; + auto bank_idx = item_pos / bank_size; + auto offset_in_bank = item_pos % bank_size; + auto var_slot_offset = bank_idx * slot_num + var_slot_idx; + auto acc_slot_offset = bank_idx * slot_num + acc_slot_idx; + auto linear_slot_offset = bank_idx * slot_num + linear_slot_idx; + extern __shared__ __align__(sizeof(Value)) unsigned char shared[]; + Value* new_acc = reinterpret_cast(shared); + __shared__ Value linear_sqr_sum; + bool var_stored = d_flags[var_slot_offset][offset_in_bank]; + bool acc_stored = d_flags[acc_slot_offset][offset_in_bank]; + bool linear_stored = d_flags[linear_slot_offset][offset_in_bank]; + __syncthreads(); + + if (var_default_v != nullptr && var_stored == false) { + d_flags[var_slot_offset][offset_in_bank] = true; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + d_banks[var_slot_offset][offset_in_bank * dim + id] = + var_default_v[(*(key_base + item_idx) % var_default_v_num) * dim + + id]; + } + } + if (acc_default_v != nullptr && acc_stored == false) { + d_flags[acc_slot_offset][offset_in_bank] = true; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + d_banks[acc_slot_offset][offset_in_bank * 
dim + id] = + acc_default_v[(*(key_base + item_idx) % acc_default_v_num) * dim + + id]; + } + } + if (linear_default_v != nullptr && linear_stored == false) { + d_flags[linear_slot_offset][offset_in_bank] = true; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + d_banks[linear_slot_offset][offset_in_bank * dim + id] = + linear_default_v[(*(key_base + item_idx) % linear_default_v_num) * + dim + + id]; + } + } + Value linear_tmp = 0; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + auto tmp_offset = offset_in_bank * dim + id; + Value* var_p = &d_banks[var_slot_offset][tmp_offset]; + Value g = grad[item_idx * dim + id]; + Value gg; + if (has_l2_shrinkage) { + gg = g + 2 * l2_shrinkage_scalar * (*var_p); + } else { + gg = g; + } + Value* acc_p = &d_banks[acc_slot_offset][tmp_offset]; + new_acc[id] = *acc_p + gg * gg; + Value* linear_p = &d_banks[linear_slot_offset][tmp_offset]; + if (lr_power_scalar == -0.5) { + (*linear_p) += + gg - (sqrtf(new_acc[id]) - sqrtf(*acc_p)) / lr_scalar * (*var_p); + } else { + (*linear_p) += gg - (powf(new_acc[id], -lr_power_scalar) - + powf(*acc_p, -lr_power_scalar)) / + lr_scalar * (*var_p); + } + linear_tmp += (*linear_p) * (*linear_p); + } + linear_tmp = blockReduceSum(linear_tmp); + if (threadIdx.x == 0) { + linear_sqr_sum = linear_tmp; + } + __syncthreads(); + Value linear_norm = sqrtf(linear_sqr_sum); + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + auto tmp_offset = offset_in_bank * dim + id; + Value* var_p = &d_banks[var_slot_offset][tmp_offset]; + Value* acc_p = &d_banks[acc_slot_offset][tmp_offset]; + Value* linear_p = &d_banks[linear_slot_offset][tmp_offset]; + Value g = grad[item_idx * dim + id]; + if (linear_norm > l1_scalar) { + if (lr_power_scalar == -0.5) { + auto eta_rec = sqrtf(new_acc[id]) / lr_scalar; + auto coef = (l1_scalar - linear_norm) / + ((eta_rec + 2 * l2_scalar) * linear_norm); + *var_p = coef * (*linear_p); + } else { + auto eta_rec = powf(new_acc[id], -lr_power_scalar) / lr_scalar; + auto coef = (l1_scalar - linear_norm) / + ((eta_rec + 2 * l2_scalar) * linear_norm); + *var_p = coef * (*linear_p); + } + } else { + *var_p = 0; + } + (*acc_p) += g * g; + } +} + +template +struct KvSparseApplyFtrl { + void operator()(int32 num_items, Allocator* alloc, EmbeddingVar* var, + EmbeddingVar* accum, EmbeddingVar* linear, + const TKey* key_base, const T* grad, T lr, T l1, T l2, + T lr_power, bool has_l2_shrinkage, T l2_shrinkage, + const GPUDevice& device) { + int32* item_idxs = TypedAllocator::Allocate(alloc, num_items, + AllocationAttributes()); + var->LookupOrCreateKey(key_base, item_idxs, num_items, device); + auto const block_size = 256; + auto const grid_size = num_items; + auto hashtable = var->HashTable(); + TF_CHECK_OK(GpuLaunchKernel( + kv_sparse_apply_ftrl_kernel, grid_size, block_size, + (var->ValueLen()) * sizeof(T), device.stream(), key_base, item_idxs, + var->ValueLen(), hashtable->d_bank_ptrs, + hashtable->d_existence_flag_ptrs, var->EmbIdx(), accum->EmbIdx(), + linear->EmbIdx(), var->SlotNum(), hashtable->initial_bank_size, lr, + grad, var->GetDefaultValuePtr(), accum->GetDefaultValuePtr(), + linear->GetDefaultValuePtr(), var->GetDefaultValueDim(), + accum->GetDefaultValueDim(), linear->GetDefaultValueDim(), l1, l2, + lr_power, has_l2_shrinkage, l2_shrinkage)); + TypedAllocator::Deallocate(alloc, item_idxs, num_items); + } +}; + +template +__global__ void KvSparseApplyAdamAsyncKernel( + const TKey* key_base, int32* item_idxs, int64 dim, T** d_banks, + bool** d_flags, int32 var_slot_idx, 
int32 v_slot_idx, int32 m_slot_idx, + int32 slot_num, int32 bank_size, const T* beta1_scalar, + const T* beta2_scalar, const T* beta1_power_scalar, + const T* beta2_power_scalar, const T* epsilon_scalar, const T* lr_scalar, + const T* grad, T* var_default_v, T* v_default_v, T* m_default_v, + int32 var_default_v_num, int32 v_default_v_num, int32 m_default_v_num, + bool apply_sparse_rmsprop) { + const T lr = *lr_scalar; + const T beta1 = *beta1_scalar; + const T beta2 = *beta2_scalar; + const T beta1_power = *beta1_power_scalar; + const T beta2_power = *beta2_power_scalar; + const T epsilon = *epsilon_scalar; + + auto item_idx = blockIdx.x; + auto item_pos = item_idxs[item_idx]; + auto bank_idx = item_pos / bank_size; + auto offset_in_bank = item_pos % bank_size; + auto var_slot_offset = bank_idx * slot_num + var_slot_idx; + auto v_slot_offset = bank_idx * slot_num + v_slot_idx; + auto m_slot_offset = bank_idx * slot_num + m_slot_idx; + bool var_stored = d_flags[var_slot_offset][offset_in_bank]; + bool v_stored = d_flags[v_slot_offset][offset_in_bank]; + bool m_stored = d_flags[m_slot_offset][offset_in_bank]; + const T alpha = lr * sqrt(static_cast(1) - beta2_power) / + (static_cast(1) - beta1_power); + __syncthreads(); + + if (var_default_v != nullptr && var_stored == false) { + d_flags[var_slot_offset][offset_in_bank] = true; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + d_banks[var_slot_offset][offset_in_bank * dim + id] = + var_default_v[(*(key_base + item_idx) % var_default_v_num) * dim + + id]; + } + } + if (v_default_v != nullptr && v_stored == false) { + d_flags[v_slot_offset][offset_in_bank] = true; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + d_banks[v_slot_offset][offset_in_bank * dim + id] = + v_default_v[(*(key_base + item_idx) % v_default_v_num) * dim + id]; + } + } + if (m_default_v != nullptr && m_stored == false) { + d_flags[m_slot_offset][offset_in_bank] = true; + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + d_banks[m_slot_offset][offset_in_bank * dim + id] = + m_default_v[(*(key_base + item_idx) % m_default_v_num) * dim + id]; + } + } + + if (apply_sparse_rmsprop) { + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + auto tmp_offset = offset_in_bank * dim + id; + T grad_a = grad[item_idx * dim + id]; + T& var_a = d_banks[var_slot_offset][tmp_offset]; + T& v_a = d_banks[v_slot_offset][tmp_offset]; + T& m_a = d_banks[m_slot_offset][tmp_offset]; + + v_a = v_a * beta2 + grad_a * grad_a * (static_cast(1) - beta2); + m_a = m_a * beta1 + rsqrt(v_a + epsilon) * lr * grad_a; + var_a -= m_a; + } + } else { + for (auto id = threadIdx.x; id < dim; id += blockDim.x) { + auto tmp_offset = offset_in_bank * dim + id; + T grad_a = grad[item_idx * dim + id]; + T& var_a = d_banks[var_slot_offset][tmp_offset]; + T& v_a = d_banks[v_slot_offset][tmp_offset]; + T& m_a = d_banks[m_slot_offset][tmp_offset]; + + m_a = m_a * beta1 + grad_a * (static_cast(1) - beta1); + v_a = v_a * beta2 + grad_a * grad_a * (static_cast(1) - beta2); + var_a -= (m_a * alpha) / (sqrt(v_a) + epsilon); + } + } +} + +template +struct KvSparseApplyAdamAsync { + Status operator()(const GPUDevice& d, EmbeddingVar* var, + EmbeddingVar* m, EmbeddingVar* v, + typename TTypes::Scalar beta1_power_scalar, + typename TTypes::Scalar beta2_power_scalar, + typename TTypes::ConstVec indices_vec, + typename TTypes::ConstMatrix grad, + typename TTypes::ConstScalar lr_scalar, + typename TTypes::ConstScalar beta1_scalar, + typename TTypes::ConstScalar beta2_scalar, + typename 
TTypes::ConstScalar epsilon_scalar, + typename TTypes::ConstScalar global_step_scalar, + bool apply_sparse_rmsprop, const int64 inner_dim, + Allocator* alloc) { + const int32 N = indices_vec.dimension(0); + if (N <= 0) return OkStatus(); + + if (inner_dim > 0) { + const int64 global_step = global_step_scalar(); + int32* item_idxs = + TypedAllocator::Allocate(alloc, N, AllocationAttributes()); + var->LookupOrCreateKey(indices_vec.data(), item_idxs, N, d, global_step); + auto const block_size = 256; + auto const grid_size = N; + auto hashtable = var->HashTable(); + TF_CHECK_OK(GpuLaunchKernel( + KvSparseApplyAdamAsyncKernel, grid_size, block_size, 0, + d.stream(), indices_vec.data(), item_idxs, var->ValueLen(), + hashtable->d_bank_ptrs, hashtable->d_existence_flag_ptrs, + var->EmbIdx(), v->EmbIdx(), m->EmbIdx(), var->SlotNum(), + hashtable->initial_bank_size, beta1_scalar.data(), + beta2_scalar.data(), beta1_power_scalar.data(), + beta2_power_scalar.data(), epsilon_scalar.data(), lr_scalar.data(), + grad.data(), var->GetDefaultValuePtr(), v->GetDefaultValuePtr(), + m->GetDefaultValuePtr(), var->GetDefaultValueDim(), + v->GetDefaultValueDim(), m->GetDefaultValueDim(), + apply_sparse_rmsprop)); + TypedAllocator::Deallocate(alloc, item_idxs, N); + } + + if (!apply_sparse_rmsprop) { + beta1_power_scalar.device(d) = beta1_power_scalar * beta1_scalar; + beta2_power_scalar.device(d) = beta2_power_scalar * beta2_scalar; + } + + return OkStatus(); + } +}; + +template +struct KvSparseApplyAdamAsyncHbm { + void operator()(int block_size, int embedding_dim, T** dev_var, T** dev_m, + T** dev_v, const T* grad_base, T lr, T beta1, T beta2, + T epsilon, T* beta1_power_ptr, T* beta2_power_ptr, + int64 task_size, const GPUDevice& device) { + TF_CHECK_OK(GpuLaunchKernel( + SparseApplyAdamAsyncGPU, + (task_size + block_size - 1) / block_size * embedding_dim, block_size, + 0, device.stream(), dev_var, dev_m, dev_v, grad_base, lr, beta1, beta2, + epsilon, beta1_power_ptr, beta2_power_ptr, embedding_dim, task_size)); + } +}; + +template +struct KvSparseApplyAdamAsyncSparseRmspropHbm { + void operator()(int block_size, int embedding_dim, T** dev_var, T** dev_m, + T** dev_v, const T* grad_base, T lr, T beta1, T beta2, + T epsilon, int64 task_size, const GPUDevice& device) { + TF_CHECK_OK(GpuLaunchKernel( + SparseApplyAdamAsyncSparseRmspropGPU, + (task_size + block_size - 1) / block_size * embedding_dim, block_size, + 0, device.stream(), dev_var, dev_m, dev_v, grad_base, lr, beta1, beta2, + epsilon, embedding_dim, task_size)); + } +}; + +template +struct KvSparseApplyAdamHbm { + void operator()(int block_size, int embedding_dim, T** dev_var, T** dev_m, + T** dev_v, const T* grad_base, T lr, T beta1, T beta2, + T epsilon, T beta1_power, T beta2_power, int64 task_size, + const GPUDevice& device) { + TF_CHECK_OK(GpuLaunchKernel( + SparseApplyAdamGPU, + (task_size + block_size - 1) / block_size * embedding_dim, block_size, + 0, device.stream(), dev_var, dev_m, dev_v, grad_base, lr, beta1, beta2, + epsilon, beta1_power, beta2_power, embedding_dim, task_size)); + } +}; + +template +struct KvSparseApplyAdamWHbm { + void operator()(int block_size, int embedding_dim, T** dev_var, T** dev_m, + T** dev_v, const T* grad_base, T lr, T beta1, T beta2, + T epsilon, T weight_decay, int64 task_size, + const GPUDevice& device) { + TF_CHECK_OK(GpuLaunchKernel( + SparseApplyAdamWGPU, + (task_size + block_size - 1) / block_size * embedding_dim, block_size, + 0, device.stream(), dev_var, dev_m, dev_v, grad_base, lr, beta1, beta2, + 
epsilon, weight_decay, embedding_dim, task_size)); + } +}; + +} // namespace functor + +#define REGISTER_ALL_TYPE(type) \ + template struct functor::KvSparseApplyAdagrad; \ + template struct functor::KvSparseApplyAdagrad; +TF_CALL_float(REGISTER_ALL_TYPE); +TF_CALL_double(REGISTER_ALL_TYPE); +#undef REGISTER_ALL_TYPE + +#define REGISTER_ALL_TYPE(type) \ + template struct functor::KvSparseApplyFtrl; \ + template struct functor::KvSparseApplyFtrl; +TF_CALL_float(REGISTER_ALL_TYPE); +TF_CALL_double(REGISTER_ALL_TYPE); +#undef REGISTER_ALL_TYPE + +#define REGISTER_ALL_TYPE(type) \ + template struct functor::KvSparseApplyAdam; \ + template struct functor::KvSparseApplyAdam; \ + template struct functor::KvSparseApplyAdam; \ + template struct functor::KvSparseApplyAdam; +TF_CALL_float(REGISTER_ALL_TYPE); +TF_CALL_double(REGISTER_ALL_TYPE); +#undef REGISTER_ALL_TYPE + +#define REGISTER_ALL_TYPE(type) \ + template struct functor::KvSparseApplyAdamAsync; \ + template struct functor::KvSparseApplyAdamAsync; \ + template struct functor::KvSparseApplyAdamAsync; \ + template struct functor::KvSparseApplyAdamAsync; +TF_CALL_float(REGISTER_ALL_TYPE); +TF_CALL_double(REGISTER_ALL_TYPE); +#undef REGISTER_ALL_TYPE + +#define REGISTER_ALL_TYPE(type) \ + template struct functor::KvSparseApplyAdamAsyncSparseRmspropHbm< \ + GPUDevice, int32, type>; \ + template struct functor::KvSparseApplyAdamAsyncSparseRmspropHbm< \ + GPUDevice, int64, type>; +TF_CALL_float(REGISTER_ALL_TYPE); +TF_CALL_double(REGISTER_ALL_TYPE); +#undef REGISTER_ALL_TYPE + +#define REGISTER_ALL_TYPE(type) \ + template struct functor::KvSparseApplyAdamAsyncHbm; \ + template struct functor::KvSparseApplyAdamAsyncHbm; +TF_CALL_float(REGISTER_ALL_TYPE); +TF_CALL_double(REGISTER_ALL_TYPE); +#undef REGISTER_ALL_TYPE + +#define REGISTER_ALL_TYPE(type) \ + template struct functor::KvSparseApplyAdamHbm; \ + template struct functor::KvSparseApplyAdamHbm; +TF_CALL_float(REGISTER_ALL_TYPE); +TF_CALL_double(REGISTER_ALL_TYPE); +#undef REGISTER_ALL_TYPE + +#define REGISTER_ALL_TYPE(type) \ + template struct functor::KvSparseApplyAdagradHbm; \ + template struct functor::KvSparseApplyAdagradHbm; +TF_CALL_float(REGISTER_ALL_TYPE); +TF_CALL_double(REGISTER_ALL_TYPE); +#undef REGISTER_ALL_TYPE + +#define REGISTER_ALL_TYPE(type) \ + template struct functor::KvSparseApplyAdamWHbm; \ + template struct functor::KvSparseApplyAdamWHbm; +TF_CALL_float(REGISTER_ALL_TYPE); +TF_CALL_double(REGISTER_ALL_TYPE); +#undef REGISTER_ALL_TYPE + +} // end namespace tensorflow +#endif // GOOGLE_CUDA diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/training_ali_ops_gpu.h b/deepray/custom_ops/embedding_variable/cc/kernels/training_ali_ops_gpu.h new file mode 100644 index 00000000..b31a3691 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/training_ali_ops_gpu.h @@ -0,0 +1,119 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_TRAINING_ALI_OPS_GPU_H_ +#define TENSORFLOW_CORE_KERNELS_TRAINING_ALI_OPS_GPU_H_ + +#if GOOGLE_CUDA +#include "deepray/custom_ops/embedding_variable/cc/embedding/embedding_var.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/platform/types.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace functor { + +template +struct KvSparseApplyAdagrad { + void operator()(int32 num_items, Allocator* alloc, + EmbeddingVar* var, EmbeddingVar* accum, + const Tindex* key_base, const T* grad, T lr, int64 gs, + const Device& device); +}; + +template +struct KvSparseApplyFtrl { + void operator()(int32 num_items, Allocator* alloc, EmbeddingVar* var, + EmbeddingVar* accum, EmbeddingVar* linear, + const TKey* key_base, const T* grad, T lr, T l1, T l2, + T lr_power, bool has_l2_shrinkage, T l2_shrinkage, + const Device& device); +}; + +template +struct KvSparseApplyAdam { + Status operator()(const Device& d, EmbeddingVar* var, + EmbeddingVar* m, EmbeddingVar* v, + typename TTypes::ConstScalar beta1_power_scalar, + typename TTypes::ConstScalar beta2_power_scalar, + typename TTypes::ConstVec indices_vec, + typename TTypes::ConstMatrix grad, + typename TTypes::ConstScalar lr_scalar, + typename TTypes::ConstScalar beta1_scalar, + typename TTypes::ConstScalar beta2_scalar, + typename TTypes::ConstScalar epsilon_scalar, + typename TTypes::ConstScalar global_step_scalar, + const int64 inner_dim, Allocator* alloc); +}; + +template +struct KvSparseApplyAdamAsync { + Status operator()(const Device& d, EmbeddingVar* var, + EmbeddingVar* m, EmbeddingVar* v, + typename TTypes::Scalar beta1_power_scalar, + typename TTypes::Scalar beta2_power_scalar, + typename TTypes::ConstVec indices_vec, + typename TTypes::ConstMatrix grad, + typename TTypes::ConstScalar lr_scalar, + typename TTypes::ConstScalar beta1_scalar, + typename TTypes::ConstScalar beta2_scalar, + typename TTypes::ConstScalar epsilon_scalar, + typename TTypes::ConstScalar global_step_scalar, + bool apply_sparse_rmsprop, const int64 inner_dim, + Allocator* alloc); +}; + +template +struct KvSparseApplyAdamAsyncSparseRmspropHbm { + void operator()(int block_size, int embedding_dim, T** dev_var, T** dev_m, + T** dev_v, const T* grad_base, T lr, T beta1, T beta2, + T epsilon, int64 task_size, const Device& device); +}; + +template +struct KvSparseApplyAdamHbm { + void operator()(int block_size, int embedding_dim, T** dev_var, T** dev_m, + T** dev_v, const T* grad_base, T lr, T beta1, T beta2, + T epsilon, T beta1_power, T beta2_power, int64 task_size, + const Device& device); +}; + +template +struct KvSparseApplyAdagradHbm { + void operator()(int block_size, int embedding_dim, T** dev_a, T** dev_v, + const T* grad_base, T lr_scalar, int64 task_size, + const Device& device); +}; + +template +struct KvSparseApplyAdamAsyncHbm { + void operator()(int block_size, int embedding_dim, T** dev_var, T** dev_m, + T** dev_v, const T* grad_base, T lr, T beta1, T beta2, + T epsilon, T* beta1_power_ptr, T* beta2_power_ptr, + int64 task_size, const Device& device); +}; + +template +struct KvSparseApplyAdamWHbm { + void operator()(int block_size, int embedding_dim, T** dev_var, T** dev_m, + T** dev_v, const T* grad_base, T lr, T beta1, T beta2, + T epsilon, T weight_decay, int64 task_size, + const Device& device); +}; +} // end namespace functor +} // end namespace tensorflow +#endif // GOOGLE_CUDA + +#endif 
// TENSORFLOW_CORE_KERNELS_TRAINING_ALI_OPS_GPU_H_ diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/training_ftrl_ops.cc b/deepray/custom_ops/embedding_variable/cc/kernels/training_ftrl_ops.cc new file mode 100644 index 00000000..741477be --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/training_ftrl_ops.cc @@ -0,0 +1,485 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#define EIGEN_USE_THREADS +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA +#include + +#include "deepray/custom_ops/embedding_variable/cc/embedding/intra_thread_copy_id_allocator.h" +#include "deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/training_op_helpers.h" +#include "tensorflow/core/kernels/variable_ops.h" +#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#include "tensorflow/core/util/work_sharder.h" +#include "training_ali_op_helpers.h" + +#ifdef TENSORFLOW_USE_SYCL +#include "tensorflow/core/common_runtime/sycl/sycl_util.h" +#endif // TENSORFLOW_USE_SYCL + +#if GOOGLE_CUDA +#include "tensorflow/core/kernels/gpu_device_array.h" +#include "tensorflow/core/platform/stream_executor.h" +#include "training_ali_ops_gpu.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { + +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; +using SYCLDevice = Eigen::SyclDevice; + +// Note, this op works on cpu only. 
+template +class KvSparseApplyFtrlOp : public OpKernel { + public: + explicit KvSparseApplyFtrlOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + } + + void Compute(OpKernelContext* ctx) override TF_NO_THREAD_SAFETY_ANALYSIS { + auto locks = MaybeLockEmbeddingVariableInputMutexesInOrder( + ctx, use_exclusive_lock_, {0, 1, 2}); + + EmbeddingVar* var_ = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 0, &var_)); + core::ScopedUnref unref_var(var_); + EmbeddingVar* accum_ = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 1, &accum_)); + core::ScopedUnref unref_accum(accum_); + EmbeddingVar* linear_ = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 2, &linear_)); + core::ScopedUnref unref_linear(linear_); + + const Tensor& grad = ctx->input(3); + const Tensor& indices = ctx->input(4); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), + errors::InvalidArgument("indices must be one-dimensional")); + + const Tensor& lr = ctx->input(5); + OP_REQUIRES(ctx, + TensorShapeUtils::IsScalar(lr.shape()) && + lr.scalar()() > static_cast(0), + errors::InvalidArgument("lr is not a positive scalar: ", + lr.shape().DebugString())); + + const Tensor& l1 = ctx->input(6); + OP_REQUIRES(ctx, + TensorShapeUtils::IsScalar(l1.shape()) && + l1.scalar()() >= static_cast(0), + errors::InvalidArgument("l1 regularization strength is not a " + "non-negative scalar: ", + l1.shape().DebugString())); + const Tensor& l2 = ctx->input(7); + OP_REQUIRES(ctx, + TensorShapeUtils::IsScalar(l2.shape()) && + l2.scalar()() >= static_cast(0), + errors::InvalidArgument("l2 regularization strength is not a " + "non-negative scalar: ", + l2.shape().DebugString())); + const int lr_power_index = has_l2_shrinkage ? 9 : 8; + const Tensor& lr_power = ctx->input(lr_power_index); + OP_REQUIRES(ctx, + TensorShapeUtils::IsScalar(lr_power.shape()) && + lr_power.scalar()() <= static_cast(0), + errors::InvalidArgument("lr_power is not a " + "non-positive scalar: ", + lr_power.shape().DebugString())); + int64 inner_dim = 1; + TensorShape var_shape({var_->ValueLen()}); + for (int d = 0; d < var_shape.dims(); d++) { + OP_REQUIRES(ctx, var_shape.dim_size(d) == grad.dim_size(d + 1), + errors::InvalidArgument(strings::StrCat( + "var and grad must match in dimension ", d + 1))); + inner_dim *= grad.dim_size(d + 1); + } + const int64 N = indices.dim_size(0); + OP_REQUIRES( + ctx, grad.dim_size(0) == N, + errors::InvalidArgument( + "grad must be the same size as indices in the first dimension.")); + + OP_REQUIRES(ctx, inner_dim > 0, + errors::InvalidArgument( + "Inner dimension should be greater than zero.")); + + const Tensor* l2_shrinkage; + if (has_l2_shrinkage) { + l2_shrinkage = &ctx->input(8); + OP_REQUIRES( + ctx, + TensorShapeUtils::IsScalar(l2_shrinkage->shape()) && + l2_shrinkage->scalar()() >= static_cast(0), + errors::InvalidArgument("l2 shrinkage regularization strength " + "is not a non-negative scalar: ", + l2_shrinkage->shape().DebugString())); + } + int64* indices_counts = nullptr; + std::function get_count_fn = 0; + if (has_counts) { + const int counts_input_index = has_l2_shrinkage ? 
10 : 9; + const Tensor& counts_tensor = ctx->input(counts_input_index); + indices_counts = (int64*)counts_tensor.data(); + get_count_fn = [](int64* counts, int64 index) { return counts[index]; }; + } else { + get_count_fn = [](int64* counts, int64 index) { return 1; }; + } + + if (N > 0) { + if (inner_dim > 0) { + auto indices_vec = indices.vec(); + auto grad_flat = grad.flat_outer_dims(); + T lr_scalar = lr.scalar()(); + T l1_scalar = l1.scalar()(); + T l2_scalar = l2.scalar()(); + T l2_shrinkage_scalar = 0.0; + if (has_l2_shrinkage) { + l2_shrinkage_scalar = l2_shrinkage->scalar()(); + } + T lr_power_scalar = lr_power.scalar()(); + auto do_work = [this, ctx, inner_dim, &var_, &indices_vec, &accum_, + &linear_, &grad_flat, &lr_scalar, &l1_scalar, + &l2_scalar, &lr_power, &l2_shrinkage_scalar, + &lr_power_scalar, get_count_fn, + indices_counts](int64 start_i, int64 limit_i) { + for (int64 i = start_i; i < limit_i; i++) { + const TKey index = indices_vec(i); + void* value_ptr = nullptr; + bool is_filter = false; + int64 count = get_count_fn(indices_counts, i); + OP_REQUIRES_OK( + ctx, var_->LookupOrCreateKey(index, &value_ptr, &is_filter, + indices_as_pointer, count)); + if (is_filter) { + auto var = var_->flat(value_ptr); + auto accum = accum_->flat(value_ptr); + auto linear = linear_->flat(value_ptr); + auto grad = grad_flat.template chip<0>(i); + +// Use a macro to implement the computation here due to the templating of the +// eigen tensor library. +#define COMPUTE_FTRL(grad_to_use) \ + auto new_accum = accum + grad_to_use.square(); \ + if (lr_power_scalar == static_cast(-0.5)) { \ + linear += \ + grad_to_use - (new_accum.sqrt() - accum.sqrt()) / lr_scalar * var; \ + } else { \ + linear += grad_to_use - (new_accum.pow(-lr_power_scalar) - \ + accum.pow(-lr_power_scalar)) / \ + lr_scalar * var; \ + } \ + Eigen::Tensor linear_sqrsum = \ + linear.square().sum().sqrt(); \ + T linear_norm = linear_sqrsum(0); \ + if (linear_norm > l1_scalar) { \ + if (lr_power_scalar == static_cast(-0.5)) { \ + auto eta_rec = new_accum.sqrt() / lr_scalar; \ + auto coef = (l1_scalar - linear_norm) / \ + ((eta_rec + static_cast(2) * l2_scalar) * linear_norm); \ + var = coef * linear; \ + } else { \ + auto eta_rec = new_accum.pow(-lr_power_scalar) / lr_scalar; \ + auto coef = (l1_scalar - linear_norm) / \ + ((eta_rec + static_cast(2) * l2_scalar) * linear_norm); \ + var = coef * linear; \ + } \ + } else { \ + var = var.constant(static_cast(0)); \ + } \ + accum += grad.square(); + if (has_l2_shrinkage) { + auto grad_with_shrinkage = + grad + static_cast(2) * l2_shrinkage_scalar * var; + COMPUTE_FTRL(grad_with_shrinkage); + } else { + COMPUTE_FTRL(grad); + } + } + } +#undef COMPUTE_FTRL + }; + + const int64 cost = 4500; // very unreliable estimate for cost per step. + auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); + Shard(worker_threads.num_threads, worker_threads.workers, N, cost, + do_work); + + if (has_counts && !indices_as_pointer) { + const int counts_input_index = has_l2_shrinkage ? 
10 : 9; + const Tensor& indices_counts = ctx->input(counts_input_index); + var_->UpdateCache(indices, indices_counts); + } + } + } + + MaybeForwardRefInputToRefOutput(ctx, 0, 0); + } + + private: + bool use_exclusive_lock_; +}; + +#define REGISTER_KERNELS(Tindices, T) \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyFtrl") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyFtrlOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyFtrl") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyFtrlOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyFtrlWithCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyFtrlOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyFtrlWithCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyFtrlOp); + +#define REGISTER_CPU_KERNELS(T) \ + REGISTER_KERNELS(int64, T); \ + REGISTER_KERNELS(int32, T); + +TF_CALL_float(REGISTER_CPU_KERNELS); + +#undef REGISTER_CPU_KERNELS +#undef REGISTER_KERNELS + +#define REGISTER_KERNELS(Tindices, T) \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyFtrlV2") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyFtrlOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyFtrlV2") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyFtrlOp) \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyFtrlV2WithCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyFtrlOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyFtrlV2WithCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyFtrlOp); + +#define REGISTER_CPU_KERNELS(T) \ + REGISTER_KERNELS(int64, T); \ + REGISTER_KERNELS(int32, T); + +TF_CALL_float(REGISTER_CPU_KERNELS); + +#undef REGISTER_CPU_KERNELS +#undef REGISTER_KERNELS + +#if GOOGLE_CUDA +template +class KvSparseApplyFtrlOpGPU : public OpKernel { + public: + explicit KvSparseApplyFtrlOpGPU(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + } + + void Compute(OpKernelContext* ctx) override TF_NO_THREAD_SAFETY_ANALYSIS { + EmbeddingVar* var_ = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 0, &var_)); + EmbeddingVar* accum_ = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 1, &accum_)); + EmbeddingVar* linear_ = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 2, &linear_)); + + const Tensor& grad = ctx->input(3); + const Tensor& indices = ctx->input(4); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), + errors::InvalidArgument("indices must be one-dimensional")); + + const Tensor& lr = ctx->input(5); + OP_REQUIRES(ctx, + TensorShapeUtils::IsScalar(lr.shape()) && + lr.scalar()() > static_cast(0), + errors::InvalidArgument("lr is not a positive scalar: ", + lr.shape().DebugString())); + + const Tensor& l1 = ctx->input(6); + OP_REQUIRES(ctx, + TensorShapeUtils::IsScalar(l1.shape()) && + l1.scalar()() >= static_cast(0), + errors::InvalidArgument("l1 regularization strength is not a " + "non-negative scalar: ", + l1.shape().DebugString())); + const Tensor& l2 = ctx->input(7); + OP_REQUIRES(ctx, + 
TensorShapeUtils::IsScalar(l2.shape()) && + l2.scalar()() >= static_cast(0), + errors::InvalidArgument("l2 regularization strength is not a " + "non-negative scalar: ", + l2.shape().DebugString())); + const int lr_power_index = has_l2_shrinkage ? 9 : 8; + const Tensor& lr_power = ctx->input(lr_power_index); + OP_REQUIRES(ctx, + TensorShapeUtils::IsScalar(lr_power.shape()) && + lr_power.scalar()() <= static_cast(0), + errors::InvalidArgument("lr_power is not a " + "non-positive scalar: ", + lr_power.shape().DebugString())); + int64 inner_dim = 1; + TensorShape var_shape({var_->ValueLen()}); + for (int d = 0; d < var_shape.dims(); d++) { + OP_REQUIRES(ctx, var_shape.dim_size(d) == grad.dim_size(d + 1), + errors::InvalidArgument(strings::StrCat( + "var and grad must match in dimension ", d + 1))); + inner_dim *= grad.dim_size(d + 1); + } + const int64 N = indices.dim_size(0); + OP_REQUIRES( + ctx, grad.dim_size(0) == N, + errors::InvalidArgument( + "grad must be the same size as indices in the first dimension.")); + + OP_REQUIRES(ctx, inner_dim > 0, + errors::InvalidArgument( + "Inner dimension should be greater than zero.")); + + const Tensor* l2_shrinkage; + if (has_l2_shrinkage) { + l2_shrinkage = &ctx->input(8); + OP_REQUIRES( + ctx, + TensorShapeUtils::IsScalar(l2_shrinkage->shape()) && + l2_shrinkage->scalar()() >= static_cast(0), + errors::InvalidArgument("l2 shrinkage regularization strength " + "is not a non-negative scalar: ", + l2_shrinkage->shape().DebugString())); + } + + if (N > 0) { + if (inner_dim > 0) { + auto indices_flat = indices.flat(); + auto grad_flat = grad.flat(); + T lr_scalar = lr.scalar()(); + T l1_scalar = l1.scalar()(); + T l2_scalar = l2.scalar()(); + T l2_shrinkage_scalar = 0.0; + if (has_l2_shrinkage) { + l2_shrinkage_scalar = l2_shrinkage->scalar()(); + } + T lr_power_scalar = lr_power.scalar()(); + const TKey* key_base = &indices_flat(0); + const T* grad_base = &grad_flat(0); + const Device& device = ctx->eigen_device(); + + functor::KvSparseApplyFtrl()( + N, ctx->get_allocator(AllocatorAttributes()), var_, accum_, linear_, + key_base, grad_base, lr_scalar, l1_scalar, l2_scalar, + lr_power_scalar, has_l2_shrinkage, l2_shrinkage_scalar, device); + } + } + + MaybeForwardRefInputToRefOutput(ctx, 0, 0); + } + + private: + bool use_exclusive_lock_; +}; + +namespace functor { +#define DECLARE_GPU_SPEC(TKey, T) \ + template <> \ + void KvSparseApplyFtrl::operator()( \ + int32 num_items, Allocator* alloc, EmbeddingVar* var, \ + EmbeddingVar* accum, EmbeddingVar* linear, \ + const TKey* key_base, const T* grad, T lr, T l1, T l2, T lr_power, \ + bool has_l2_shrinkage, T l2_shrinkage, const GPUDevice& device); \ + extern template struct KvSparseApplyFtrl; +DECLARE_GPU_SPEC(int32, float); +DECLARE_GPU_SPEC(int32, double); +DECLARE_GPU_SPEC(int64, float); +DECLARE_GPU_SPEC(int64, double); +#undef DECLARE_GPU_SPEC +} // namespace functor + +#define REGISTER_KERNELS(Tindices, T) \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyFtrl") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("lr") \ + .HostMemory("l1") \ + .HostMemory("l2") \ + .HostMemory("lr_power") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyFtrlOpGPU); +#define REGISTER_GPU_KERNELS(T) \ + REGISTER_KERNELS(int64, T); \ + REGISTER_KERNELS(int32, T); +TF_CALL_float(REGISTER_GPU_KERNELS); +TF_CALL_double(REGISTER_GPU_KERNELS); +#undef REGISTER_GPU_KERNELS +#undef REGISTER_KERNELS + +#define REGISTER_KERNELS(Tindices, T) \ + REGISTER_KERNEL_BUILDER( \ + 
Name("KvResourceSparseApplyFtrlV2") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .HostMemory("lr") \ + .HostMemory("l1") \ + .HostMemory("l2") \ + .HostMemory("lr_power") \ + .HostMemory("l2_shrinkage") \ + .TypeConstraint("Tindices"), \ + KvSparseApplyFtrlOpGPU); +#define REGISTER_GPU_KERNELS(T) \ + REGISTER_KERNELS(int64, T); \ + REGISTER_KERNELS(int32, T); +TF_CALL_float(REGISTER_GPU_KERNELS); +TF_CALL_double(REGISTER_GPU_KERNELS); +#undef REGISTER_GPU_KERNELS +#undef REGISTER_KERNELS +#endif // GOOGLE_CUDA + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/kernels/training_sgd_ops.cc b/deepray/custom_ops/embedding_variable/cc/kernels/training_sgd_ops.cc new file mode 100644 index 00000000..16dd8e6d --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/kernels/training_sgd_ops.cc @@ -0,0 +1,200 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#define EIGEN_USE_THREADS +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif // GOOGLE_CUDA +#include + +#include "deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.h" +#include "tensorflow/core/framework/bounds_check.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/training_op_helpers.h" +#include "tensorflow/core/kernels/variable_ops.h" +#include "tensorflow/core/lib/bfloat16/bfloat16.h" +#include "tensorflow/core/util/work_sharder.h" +#include "training_ali_op_helpers.h" + +#ifdef TENSORFLOW_USE_SYCL +#include "tensorflow/core/common_runtime/sycl/sycl_util.h" +#endif // TENSORFLOW_USE_SYCL + +#if GOOGLE_CUDA +#include "tensorflow/core/kernels/gpu_device_array.h" +#include "tensorflow/core/platform/stream_executor.h" +#endif // GOOGLE_CUDA + +namespace tensorflow { + +using CPUDevice = Eigen::ThreadPoolDevice; +using GPUDevice = Eigen::GpuDevice; +using SYCLDevice = Eigen::SyclDevice; + +template +class KvResourceSparseApplyGradientDescentOp : public OpKernel { + public: + explicit KvResourceSparseApplyGradientDescentOp(OpKernelConstruction* ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + } + + void Compute(OpKernelContext* ctx) override TF_NO_THREAD_SAFETY_ANALYSIS { + auto locks = MaybeLockEmbeddingVariableInputMutexesInOrder( + ctx, use_exclusive_lock_, {0}); + + EmbeddingVar* var = nullptr; + OP_REQUIRES_OK(ctx, GetInputEmbeddingVar(ctx, 0, &var)); + core::ScopedUnref unref_var(var); + + const Tensor& lr = ctx->input(1); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), + errors::InvalidArgument("lr is not a scalar: ", + lr.shape().DebugString())); + + const Tensor& grad = ctx->input(2); + const Tensor& indices = ctx->input(3); + OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()), + errors::InvalidArgument("indices must be one-dimensional")); + + const Tensor& global_step = ctx->input(4); + 
OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(global_step.shape()), + errors::InvalidArgument("global_step is not a scalar: ", + global_step.shape().DebugString())); + + int64 inner_dim = 1; + TensorShape var_shape({var->ValueLen()}); + for (int d = 0; d < var_shape.dims(); d++) { + OP_REQUIRES(ctx, var_shape.dim_size(d) == grad.dim_size(d + 1), + errors::InvalidArgument(strings::StrCat( + "var and grad must match in dimension ", d + 1))); + inner_dim *= grad.dim_size(d + 1); + } + OP_REQUIRES(ctx, inner_dim > 0, + errors::InvalidArgument( + "Inner dimension should be greater than zero.")); + + const int64 N = indices.dim_size(0); + OP_REQUIRES( + ctx, grad.dim_size(0) == N, + errors::InvalidArgument( + "grad must be the same size as indices in the first dimension.")); + int64* indices_counts = nullptr; + std::function get_count_fn = 0; + if (has_counts) { + const Tensor& counts_tensor = ctx->input(5); + indices_counts = (int64*)counts_tensor.data(); + get_count_fn = [](int64* counts, int64 index) { return counts[index]; }; + } else { + get_count_fn = [](int64* counts, int64 index) { return 1; }; + } + + if (N > 0) { + auto indices_vec = indices.vec(); + T lr_scalar = lr.scalar()(); + Tstep gs = global_step.scalar()(); + + if (inner_dim > 0) { + auto grad_flat = grad.flat_outer_dims(); + auto do_work = [this, ctx, &indices_vec, var, &grad_flat, &gs, + &lr_scalar, indices_counts, + get_count_fn](int64 start_i, int64 limit_i) { + for (int64 i = start_i; i < limit_i; i++) { + const Tindex index = indices_vec(i); + void* value_ptr = nullptr; + bool is_filter = false; + int64 count = get_count_fn(indices_counts, i); + OP_REQUIRES_OK(ctx, + var->LookupOrCreateKey(index, &value_ptr, &is_filter, + indices_as_pointer, count)); + var->UpdateVersion(value_ptr, gs); + if (is_filter) { + auto g = grad_flat.template chip<0>(i); + auto v = var->flat(value_ptr); + v -= g.constant(lr_scalar) * g; + } + } + }; + const int64 cost = 1000; + auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); + Shard(worker_threads.num_threads, worker_threads.workers, N, cost, + do_work); + if (has_counts && !indices_as_pointer) { + const Tensor& indices = ctx->input(5); + var->UpdateCache(indices, indices_counts); + } else { + var->UpdateCache(indices); + } + } + } + + MaybeForwardRefInputToRefOutput(ctx, 0, 0); + } + + private: + bool use_exclusive_lock_; +}; + +#define REGISTER_KERNELS(T, Tindices, Tstep) \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyGradientDescent") \ + .Device(DEVICE_CPU) \ + .HostMemory("var") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvResourceSparseApplyGradientDescentOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyGradientDescent") \ + .Device(DEVICE_CPU) \ + .HostMemory("var") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvResourceSparseApplyGradientDescentOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("KvResourceSparseApplyGradientDescentWithCounts") \ + .Device(DEVICE_CPU) \ + .HostMemory("var") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvResourceSparseApplyGradientDescentOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_OPT_KvResourceSparseApplyGradientDescentWithCounts") \ + .Device(DEVICE_CPU) \ + .HostMemory("var") \ + .TypeConstraint("T") \ + .TypeConstraint("Tindices") \ + .TypeConstraint("Tstep"), \ + KvResourceSparseApplyGradientDescentOp); + +#define REGISTER_CPU_KERNELS(T) \ + REGISTER_KERNELS(T, 
int64, int32); \ + REGISTER_KERNELS(T, int64, int64); \ + REGISTER_KERNELS(T, int32, int32); \ + REGISTER_KERNELS(T, int32, int64); + +TF_CALL_float(REGISTER_CPU_KERNELS); + +#undef REGISTER_CPU_KERNELS +#undef REGISTER_KERNELS + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/ops/embedding_collection.cc b/deepray/custom_ops/embedding_variable/cc/ops/embedding_collection.cc new file mode 100644 index 00000000..c3a8e129 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/ops/embedding_collection.cc @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +namespace tensorflow { + +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +REGISTER_OP("HotnessCalculate") + .Input("row_length_buffer: Tindices") + .Output("hotness: int32") + .Attr("num_gpus: int") + .Attr("num_lookups: int") + .Attr("Tindices: {int32, int64} = DT_INT64") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle unknown_1d_shape = c->UnknownShapeOfRank(1); + + c->set_output(0, unknown_1d_shape); + + return OkStatus(); + }); + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/ops/group_embedding_ops.cc b/deepray/custom_ops/embedding_variable/cc/ops/group_embedding_ops.cc new file mode 100644 index 00000000..09f237ed --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/ops/group_embedding_ops.cc @@ -0,0 +1,282 @@ +// Copyright 2016 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// ============================================================================ + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/shape_inference.h" + +using ::tensorflow::shape_inference::DimensionHandle; +using ::tensorflow::shape_inference::InferenceContext; +using ::tensorflow::shape_inference::ShapeAndType; +using ::tensorflow::shape_inference::ShapeHandle; + +namespace tensorflow { + +REGISTER_OP("GroupEmbeddingVarLookupDense") + .Input("resource: num_lookups * resource") + .Input("dense_values: num_lookups * Tkeys") + .Input("default_value: dtype") + .Attr("is_use_default_value_tensor: bool = false") + .Attr("dimension: int") + .Output("output: num_lookups * dtype") + .Output("unique_keys: num_lookups * Tkeys") + .Output("unique_idx: num_lookups * int32") + .Attr("dtype: type") + .Attr("Tkeys: {int64, int32}") + .Attr("max_norm: float = -1.0") + .Attr("num_lookups: int >= 1") + .Attr("is_inference: bool = false") + .Attr("combiner: {'sqrtn', 'mean', 'sum'} = 'mean'") // placeholder + .Attr("ignore_weights: bool = true") // placeholder + .Attr("is_sequence: bool = false") + .SetShapeFn([](InferenceContext* c) { + int num_lookups; + TF_RETURN_IF_ERROR(c->GetAttr("num_lookups", &num_lookups)); + const std::vector* shapes_and_types = + nullptr; + for (int i = 0; i < num_lookups; ++i) { + shapes_and_types = c->input_handle_shapes_and_types(i); + // LOG(INFO) << "shapes_and_types: shape=" + // << c->DebugString(shapes_and_types->at(0).shape); + + ShapeHandle temp; + TF_RETURN_IF_ERROR( + c->WithRankAtLeast(c->input(num_lookups + i), 1, &temp)); + + ShapeHandle unused; + TF_RETURN_IF_ERROR( + c->WithRankAtLeast(shapes_and_types->at(0).shape, 1, &unused)); + ShapeHandle params_subshape; + params_subshape = shapes_and_types->at(0).shape; + + ShapeHandle indices_shape = c->input(num_lookups + i); + ShapeHandle out; + TF_RETURN_IF_ERROR( + c->Concatenate(indices_shape, params_subshape, &out)); + c->set_output(i, out); + c->set_output(num_lookups + i, + c->Vector(InferenceContext::kUnknownDim)); + // c->set_output(num_lookups * 2 + i, c->input(num_lookups+i)); + } + + return OkStatus(); + }); + +REGISTER_OP("GroupEmbeddingVarLookup") + .Input("resource: num_lookups * resource") + .Input("sp_values: num_lookups * Tkeys") + .Input("sp_indices: num_lookups * int64") + .Input("sp_weights: num_lookups * dtype") + .Input("dense_shape: num_lookups * int64") + .Input("default_value: dtype") + .Attr("ignore_weights: bool = false") + .Attr("is_use_default_value_tensor: bool = false") + .Attr("is_sequence: bool = false") + .Attr("combiner: {'sqrtn', 'mean', 'sum'}") + .Attr("dimension: int") + .Output("output: num_lookups * dtype") + .Output("unique_keys: num_lookups * Tkeys") + .Output("unique_idx: num_lookups * int32") + .Output("batch_nums: num_lookups * int32") + .Attr("dtype: type") + .Attr("Tkeys: {int64, int32}") + .Attr("max_norm: float = -1.0") + .Attr("num_lookups: int >= 1") + .Attr("is_inference: bool = false") + .SetShapeFn([](InferenceContext* c) { + int num_lookups; + TF_RETURN_IF_ERROR(c->GetAttr("num_lookups", &num_lookups)); + + for (int i = 0; i < num_lookups; ++i) { + auto shapes_and_types = c->input_handle_shapes_and_types(i); + ShapeHandle unused; + TF_RETURN_IF_ERROR( + c->WithRankAtLeast(shapes_and_types->at(0).shape, 1, &unused)); + TF_RETURN_IF_ERROR( + c->WithRank(c->input(num_lookups 
* 2 + i), 2, &unused)); + // TF_RETURN_IF_ERROR(c->WithRank(c->input(num_lookups*3+i), 1, + // &unused)); + ShapeHandle params_subshape; + params_subshape = shapes_and_types->at(0).shape; + + ShapeHandle indices_shape = c->input(num_lookups + i); + ShapeHandle out; + TF_RETURN_IF_ERROR( + c->Concatenate(indices_shape, params_subshape, &out)); + c->set_output(i, out); + c->set_output(num_lookups + i, + c->Vector(InferenceContext::kUnknownDim)); + c->set_output(num_lookups * 2 + i, c->input(num_lookups + i)); + c->set_output(num_lookups * 3 + i, + c->Vector(InferenceContext::kUnknownDim)); + } + + return OkStatus(); + }); + +REGISTER_OP("GroupEmbeddingVariableLookupGrad") + .Input("grads: num_lookups * dtype") + .Input("embedding_resources: num_lookups * resource") + .Input("unique_keys: num_lookups * Tkeys") + .Input("sp_indices: num_lookups * int64") + .Input("batch_nums: num_lookups * int32") + .Output("nnz_grads: num_lookups * dtype") + .Attr("dimension: int") + .Attr("combiner: {'sqrtn', 'mean', 'sum'}") + .Attr("num_lookups: int >=1") + .Attr("dtype: type") + .Attr("Tkeys: {int64, int32}") + .Attr("max_norm: float = -1.0") + .SetShapeFn([](InferenceContext* ctx) { + int num_lookups = ctx->num_outputs(); + for (int i = 0; i < num_lookups; ++i) { + ShapeHandle top_grad_shape; + TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(i), 2, &top_grad_shape)); + DimensionHandle emb_vec_size_dim = ctx->Dim(top_grad_shape, 1); + ctx->set_output(i, + ctx->MakeShape({ctx->UnknownDim(), emb_vec_size_dim})); + } + return OkStatus(); + }); + +REGISTER_OP("GroupVariableLookup") + .Input("emb_variables: num_lookups * dtype") + .Input("sp_values: num_lookups * Tkeys") + .Input("sp_indices: num_lookups * int64") + .Input("sp_weights: num_lookups * dtype") + .Input("dense_shape: num_lookups * int64") + .Input("default_value: dtype") + .Output("output: num_lookups * dtype") + .Output("unique_keys: num_lookups * Tkeys") + .Output("unique_idx: num_lookups * int32") + .Output("batch_nums: num_lookups * int32") + .Attr("combiner: {'sqrtn', 'mean', 'sum'}") + .Attr("dimension: int") + .Attr("dtype: type") + .Attr("Tkeys: {int64, int32}") + .Attr("max_norm: float = -1.0") + .Attr("num_lookups: int >= 1") + .Attr("ignore_weights: bool = false") + .Attr("is_use_default_value_tensor: bool = false") + .Attr("is_sequence: bool = false") + .SetShapeFn([](InferenceContext* ctx) { + int num_lookups; + TF_RETURN_IF_ERROR(ctx->GetAttr("num_lookups", &num_lookups)); + + bool is_sequence; + TF_RETURN_IF_ERROR(ctx->GetAttr("is_sequence", &is_sequence)); + + for (int i = 0; i < num_lookups; ++i) { + ShapeHandle temp; + TF_RETURN_IF_ERROR( + ctx->WithRank(ctx->input(num_lookups + i), 1, &temp)); + TF_RETURN_IF_ERROR( + ctx->WithRank(ctx->input(2 * num_lookups + i), 2, &temp)); + // TF_RETURN_IF_ERROR(ctx->WithRank(ctx->input(3*num_lookups+i), 1, + // &temp)); + ShapeHandle unused; + TF_RETURN_IF_ERROR(ctx->WithRankAtLeast(ctx->input(i), 1, &unused)); + ShapeHandle params_subshape; + TF_RETURN_IF_ERROR(ctx->Subshape(ctx->input(i), 1, ¶ms_subshape)); + DimensionHandle emb_vec_size_dim = ctx->Dim(params_subshape, 0); + DimensionHandle batch_dim = ctx->UnknownDim(); + if (is_sequence) { + ShapeHandle output_shape = + ctx->MakeShape({batch_dim, batch_dim, emb_vec_size_dim}); + ctx->set_output(i, output_shape); + } else { + ShapeHandle output_shape = + ctx->MakeShape({batch_dim, emb_vec_size_dim}); + ctx->set_output(i, output_shape); + } + ctx->set_output(num_lookups + i, + ctx->Vector(InferenceContext::kUnknownDim)); + 
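        // Per lookup the outputs are: output and unique_keys (set above),
+        // unique_idx (same shape as sp_values) and batch_nums
+        // (unknown-length vector), set below. +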
ctx->set_output(num_lookups * 2 + i, ctx->input(num_lookups + i)); + ctx->set_output(num_lookups * 3 + i, + ctx->Vector(InferenceContext::kUnknownDim)); + } + + return OkStatus(); + }); + +REGISTER_OP("GroupVariableLookupGrad") + .Input("grads: num_lookups * float32") + .Input("embedding_variables: num_lookups * dtype") + .Input("unique_keys: num_lookups * Tkeys") + .Input("sp_indices: num_lookups * int64") + .Input("batch_nums: num_lookups * int32") + .Output("nnz_grads: num_lookups * float32") + .Attr("dimension: int") + .Attr("combiner: {'sqrtn', 'mean', 'sum'}") + .Attr("num_lookups: int >=1") + .Attr("dtype: type") + .Attr("Tkeys: {int64, int32}") + .Attr("max_norm: float = -1.0") + .SetShapeFn([](InferenceContext* ctx) { + int num_lookups = ctx->num_outputs(); + for (int i = 0; i < num_lookups; ++i) { + ShapeHandle top_grad_shape; + TF_RETURN_IF_ERROR( + ctx->WithRankAtLeast(ctx->input(i), 2, &top_grad_shape)); + DimensionHandle emb_vec_size_dim = ctx->Dim(top_grad_shape, 1); + ctx->set_output(i, + ctx->MakeShape({ctx->UnknownDim(), emb_vec_size_dim})); + } + return OkStatus(); + }); + +REGISTER_OP("GroupVariableLookupDense") + .Input("emb_variables: num_lookups * dtype") + .Input("dense_values: num_lookups * Tkeys") + .Input("default_value: dtype") + .Output("output: num_lookups * dtype") + .Output("unique_keys: num_lookups * Tkeys") + .Output("unique_idx: num_lookups * int32") + .Attr("dimension: int") + .Attr("dtype: type") + .Attr("Tkeys: {int64, int32}") + .Attr("max_norm: float = -1.0") + .Attr("num_lookups: int >= 1") + .Attr("combiner: {'sqrtn', 'mean', 'sum'} = 'mean'") // placeholder + .Attr("ignore_weights: bool = true") // placeholder + .SetShapeFn([](InferenceContext* ctx) { + int num_lookups; + TF_RETURN_IF_ERROR(ctx->GetAttr("num_lookups", &num_lookups)); + + for (int i = 0; i < num_lookups; ++i) { + ShapeHandle temp; + TF_RETURN_IF_ERROR( + ctx->WithRankAtLeast(ctx->input(num_lookups + i), 1, &temp)); + ShapeHandle unused; + TF_RETURN_IF_ERROR(ctx->WithRankAtLeast(ctx->input(i), 1, &unused)); + ShapeHandle params_subshape; + TF_RETURN_IF_ERROR(ctx->Subshape(ctx->input(i), 1, ¶ms_subshape)); + DimensionHandle emb_vec_size_dim = ctx->Dim(params_subshape, 0); + DimensionHandle batch_dim = ctx->UnknownDim(); + ShapeHandle output_shape = + ctx->MakeShape({batch_dim, emb_vec_size_dim}); + ShapeHandle offset_shape = ctx->MakeShape({batch_dim, 1}); + ctx->set_output(i, output_shape); + ctx->set_output(num_lookups + i, + ctx->Vector(InferenceContext::kUnknownDim)); + // ctx->set_output(num_lookups * 2 + i, ctx->input(num_lookups+i)); + } + + return OkStatus(); + }); + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/ops/incr_save_restore_ops.cc b/deepray/custom_ops/embedding_variable/cc/ops/incr_save_restore_ops.cc new file mode 100644 index 00000000..771cdf9f --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/ops/incr_save_restore_ops.cc @@ -0,0 +1,73 @@ +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/util/saved_tensor_slice_util.h" + +namespace tensorflow { + +using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +REGISTER_OP("KvResourceIncrImport") + .Input("prefix: string") + .Input("resource_handle: resource") + .Input("tensor_names: string") + .Input("empty_key: Tkeys") + .Input("value: dtype") + .Attr("Tkeys: {int64, int32}") + 
.Attr("dtype: type") + .Attr("partition_id: int = 0") + .Attr("partition_num: int = 1") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle handle; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); + return OkStatus(); + }) + .Doc(R"doc()doc"); + +REGISTER_OP("IncrSave") + .Input("prefix: string") + .Input("tensor_names: string") + .Input("shape_and_slices: string") + .Input("is_sparse: bool") + .Input("tensors: dtypes") + .Attr("dtypes: list(type)") + .SetIsStateful() + .SetShapeFn([](InferenceContext* c) { return OkStatus(); }); + +REGISTER_OP("IncrRestore") + .Input("prefix: string") + .Input("tensor_names: string") + .Input("shape_and_slices: string") + .Input("is_sparse: bool") + .Input("in_tensors: dtypes") + .Output("out_tensors: dtypes") + .Attr("dtypes: list(type)") + .SetIsStateful() + .SetShapeFn([](InferenceContext* c) { return OkStatus(); }); + +REGISTER_OP("RecordSparseIndices") + .Input("keys: TIndex") + .Attr("var_name: string = ''") + .Attr("TIndex: {int32, int64}") + .Attr("auto_record: bool = false") + .SetShapeFn([](InferenceContext* c) { return OkStatus(); }); + +REGISTER_OP("ActivateSparseRecorder") + .Input("tensor_names: string") + .SetShapeFn([](InferenceContext* c) { return OkStatus(); }); + +REGISTER_OP("CollectSparseIndices") + .Output("indices: ktype") + .Output("global_indices: ktype") + .Attr("tensor_name: string") + .Attr("config: string = ''") + .Attr("part_idx: int = -1") + .Attr("part_count: int = 0") + .Attr("hash_bucket_size: int = 0") + .Attr("part_mode: string = ''") + .Attr("ktype: {int32, int64}") + .SetShapeFn([](InferenceContext* c) { return OkStatus(); }); + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/ops/kv_variable_ops.cc b/deepray/custom_ops/embedding_variable/cc/ops/kv_variable_ops.cc new file mode 100644 index 00000000..7c354106 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/ops/kv_variable_ops.cc @@ -0,0 +1,436 @@ +// Copyright 2016 The TensorFlow Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// ============================================================================ + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/shape_inference.h" + +using ::tensorflow::shape_inference::DimensionHandle; +using ::tensorflow::shape_inference::InferenceContext; +using ::tensorflow::shape_inference::ShapeAndType; +using ::tensorflow::shape_inference::ShapeHandle; + +namespace tensorflow { + +namespace { + +Status ReadVariableShapeFn(InferenceContext* c) { + std::vector shape_and_type; + TF_RETURN_IF_ERROR(ValidateVariableResourceHandle(c, &shape_and_type)); + c->set_output(0, shape_and_type[0].shape); + return OkStatus(); +} + +Status CreateAssignShapeFn(InferenceContext* c) { + std::vector handle_shape_and_type; + TF_RETURN_IF_ERROR(shape_inference::ValidateVariableResourceHandle( + c, &handle_shape_and_type)); + + ShapeHandle value_shape = c->input(1); + ShapeHandle unused; + TF_RETURN_IF_ERROR( + c->Merge(handle_shape_and_type[0].shape, value_shape, &unused)); + return OkStatus(); +} + +} // namespace + +// KvVar +REGISTER_OP("KvVarHandleOp") + .Attr("container: string = ''") + .Attr("shared_name: string = ''") + .Attr("dtype: type") + .Attr("shape: shape") + .Attr("Tkeys: {int64, int32} = DT_INT64") + .Output("resource: resource") + .SetIsStateful() + .SetShapeFn([](InferenceContext* c) { + c->set_output(0, c->Scalar()); + DataType t; + TF_RETURN_IF_ERROR(c->GetAttr("dtype", &t)); + PartialTensorShape p; + TF_RETURN_IF_ERROR(c->GetAttr("shape", &p)); + ShapeHandle s; + TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(p, &s)); + c->set_output_handle_shapes_and_types(0, + std::vector{{s, t}}); + + return absl::OkStatus(); + }) + .Doc(R"( +Creates a handle to a Variable resource. + +container: the container this variable is placed in. +shared_name: the name by which this variable is referred to. +dtype: the type of this variable. Must agree with the dtypes + of all ops using this variable. +shape: The (possibly partially specified) shape of this variable. +)"); + +REGISTER_OP("ReadKvVariableOp") + .Input("resource: resource") + .Output("value: dtype") + .Attr("dtype: type") + .Attr("Tkeys: {int64, int32}") + .SetShapeFn(ReadVariableShapeFn) + .Doc(R"( +Reads the value of a variable. + +The tensor returned by this operation is immutable. + +The value returned by this operation is guaranteed to be influenced by all the +writes on which this operation depends directly or indirectly, and to not be +influenced by any of the writes which depend directly or indirectly on this +operation. + +resource: handle to the resource in which to store the variable. +dtype: the dtype of the value. 
+)"); + +REGISTER_OP("InitializeKvVariableOp") + .Input("resource_self: resource") + .Input("resource_primary: resource") + .Input("value: dtype") + .Input("empty_key: Tkeys") + .Attr("slot_num: int = 0") + .Attr("Tkeys: {int64, int32}") + .Attr("dtype: type") + .Attr("shape: shape") + .Attr("initial_num_buckets: int = 131072") // 2^17 + .Attr("max_load_factor: float = 0.8") + .Attr("steps_to_live: int = 0") + .Attr("ht_type: string = ''") + .Attr("emb_index: int = 0") + .Attr("block_num: int = 1") + .Attr("slot_index: int = 0") + .Attr("ht_partition_num: int = 1000") + .Attr("filter_freq: int = 0") + .Attr("max_freq: int = 999999") + .Attr("max_element_size: int = 0") + .Attr("counter_type: type") + .Attr("false_positive_probability: float = -1.0") + .Attr("l2_weight_threshold: float =-1.0") + .Attr("layout: string = ''") + .Attr("storage_type: int = 0") + .Attr("storage_path: string = '.'") + .Attr("storage_size: list(int) = []") + .Attr("default_value_dim: int = 4096") + .Attr("default_value_no_permission: float = .0") + .Attr("record_freq: bool = false") + .Attr("record_version: bool = false") + .SetShapeFn([](InferenceContext* c) { return OkStatus(); }) + .Doc(R"( +Assigns a new value to a variable. + +Any ReadVariableOp with a control dependency on this op is guaranteed to return +this value or a subsequent newer value of the variable. + +resource_self: handle to the resource in which to store the variable. +resource_primary: handle to the resource in which to store the variable. +value: the value to set the new tensor to use. +dtype: the dtype of the value. +)"); + +REGISTER_OP("InitializeKvVariableV2Op") + .Input("resource_self: resource") + .Input("resource_primary: resource") + .Input("value: dtype") + .Input("empty_key: Tkeys") + .Attr("slot_num: int = 0") + .Attr("Tkeys: {int64, int32}") + .Attr("dtype: type") + .Attr("shape: shape") + .Attr("initial_num_buckets: int = 131072") // 2^17 + .Attr("max_load_factor: float = 0.8") + .Attr("steps_to_live: int = 0") + .Attr("ht_type: string = ''") + .Attr("emb_index: int = 0") + .Attr("block_num: int = 1") + .Attr("slot_index: int = 0") + .Attr("ht_partition_num: int = 1000") + .Attr("filter_freq: int = 0") + .Attr("max_freq: int = 999999") + .Attr("max_element_size: int = 0") + .Attr("counter_type: type") + .Attr("false_positive_probability: float = -1.0") + .Attr("l2_weight_threshold: float =-1.0") + .Attr("layout: string = ''") + .Attr("storage_type: int = 0") + .Attr("storage_path: string = '.'") + .Attr("storage_size: list(int) = []") + .Attr("default_value_dim: int = 4096") + .Attr("default_value_no_permission: float = .0") + .Attr("record_freq: bool = false") + .Attr("record_version: bool = false") + .Attr("embedding_variable_type: int = 0") + .SetShapeFn([](InferenceContext* c) { return OkStatus(); }) + .Doc(R"( +Assigns a new value to a variable. + +Any ReadVariableOp with a control dependency on this op is guaranteed to return +this value or a subsequent newer value of the variable. + +resource_self: handle to the resource in which to store the variable. +resource_primary: handle to the resource in which to store the variable. +value: the value to set the new tensor to use. +dtype: the dtype of the value. +)"); + +REGISTER_OP("KvVarIsInitializedOp") + .Input("resource: resource") + .Output("is_initialized: bool") + .Attr("Tkeys: {int64, int32}") + .Attr("dtype: type = DT_FLOAT") + .SetShapeFn(tensorflow::shape_inference::ScalarShape) + .Doc(R"doc( +Checks whether a resource handle-based variable has been initialized. 
+ +resource: the input resource handle. +is_initialized: a scalar boolean which is true if the variable has been +initialized. +)doc"); + +REGISTER_OP("KvVarIsAllSlotInitializedOp") + .Input("resource: resource") + .Output("is_all_slot_initialized: bool") + .Attr("Tkeys: {int64, int32}") + .Attr("dtype: type = DT_FLOAT") + .SetShapeFn(tensorflow::shape_inference::ScalarShape) + .Doc(R"doc( +Checks whether a resource handle-based variable has been initialized. + +resource: the input resource handle. +is_all_slot_initialized: a scalar boolean which is true if the variable has been +initialized. +)doc"); + +REGISTER_OP("KvResourceInitCacheStrategyOp") + .Input("resource: resource") + .Attr("cache_strategy: int = 1") + .Attr("Tkeys: {int64, int32}") + .Attr("dtype: {float32, double}") + .SetShapeFn([](InferenceContext* c) { return OkStatus(); }); + +Status KvVariableShapeShapeFn(InferenceContext* c) { + auto* handle_data = c->input_handle_shapes_and_types(0); + if (handle_data == nullptr || handle_data->empty()) { + return errors::InvalidArgument("Handle doesn't have shape information."); + } + c->set_output(0, (*handle_data)[0].shape); + return OkStatus(); +} + +REGISTER_OP("KvVariableShape") + .Input("input: resource") + .Output("output: out_type") + .Attr("out_type: {int32, int64} = DT_INT32") + .Attr("Tkeys: {int64, int32}") + .Attr("dtype: type = DT_FLOAT") + // .SetShapeFn(KvVariableShapeShapeFn) + .SetShapeFn([](InferenceContext* c) { + c->set_output(0, c->Vector(2)); + return OkStatus(); + }) + .Doc(R"doc( +Returns the shape of the variable pointed to by `resource`. + +This operation returns a 1-D integer tensor representing the shape of `input`. + +For example: + +``` +# 't' is [[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]] +shape(t) ==> [2, 2, 3] +``` + +)doc"); + +REGISTER_OP("DestroyKvResourceOp") + .Input("resource: resource") + .Attr("ignore_lookup_error: bool = true") + .SetIsStateful() + .SetShapeFn(shape_inference::NoOutputs) + .Doc(R"( +Deletes the resource specified by the handle. + +All subsequent operations using the resource will result in a NotFound +error status. + +resource: handle to the resource to delete. +ignore_lookup_error: whether to ignore the error when the resource + doesn't exist. +)"); + +REGISTER_OP("_OPT_KvResourceLookupID") + .Input("resource: resource") + .Input("indices: Tkeys") + .Output("pointer: int64") + .Attr("dtype: type") + .Attr("Tkeys: {int64, int32}") + .SetShapeFn([](InferenceContext* c) { + std::vector handle_shape_and_type; + TF_RETURN_IF_ERROR( + ValidateVariableResourceHandle(c, &handle_shape_and_type)); + + ShapeHandle unused; + TF_RETURN_IF_ERROR( + c->WithRankAtLeast(handle_shape_and_type[0].shape, 1, &unused)); + + ShapeHandle indices_shape = c->input(1); + c->set_output(0, indices_shape); + return OkStatus(); + }) + .Doc(R"doc( +Lookup the `pointer` from the variable pointed to by `resource` according to `indices`. 
+)doc"); + +REGISTER_OP("KvResourceGatherV1") + .Input("resource: resource") + .Input("indices: Tkeys") + .Input("default_value: dtype") + .Input("counts: counts_type") + .Attr("validate_indices: bool = true") + .Attr("is_use_default_value_tensor: bool = false") + .Attr("is_inference: bool = false") + .Output("output: dtype") + .Attr("dtype: type") + .Attr("Tkeys: {int64, int32}") + .Attr("counts_type: {int32, int64} = DT_INT32") + .SetShapeFn([](InferenceContext* c) { + std::vector handle_shape_and_type; + TF_RETURN_IF_ERROR( + ValidateVariableResourceHandle(c, &handle_shape_and_type)); + + ShapeHandle unused; + TF_RETURN_IF_ERROR( + c->WithRankAtLeast(handle_shape_and_type[0].shape, 1, &unused)); + ShapeHandle params_subshape; + params_subshape = handle_shape_and_type[0].shape; + // TF_RETURN_IF_ERROR( + // c->Subshape(handle_shape_and_type.shape, 1, ¶ms_subshape)); + ShapeHandle indices_shape = c->input(1); + ShapeHandle out; + TF_RETURN_IF_ERROR(c->Concatenate(indices_shape, params_subshape, &out)); + c->set_output(0, out); + return OkStatus(); + }) + .Doc(R"doc( +Gather slices from the variable pointed to by `resource` according to `indices`. + +`indices` must be an integer tensor of any dimension (usually 0-D or 1-D). +Produces an output tensor with shape `indices.shape + params.shape[1:]` where: + +```python + # Scalar indices + output[:, ..., :] = params[indices, :, ... :] + + # Vector indices + output[i, :, ..., :] = params[indices[i], :, ... :] + + # Higher rank indices + output[i, ..., j, :, ... :] = params[indices[i, ..., j], :, ..., :] +``` + +)doc"); + +REGISTER_OP("KvResourceGather") + .Input("resource: resource") + .Input("indices: Tkeys") + .Input("default_value: dtype") + .Attr("is_use_default_value_tensor: bool = false") + .Attr("validate_indices: bool = true") + .Output("output: dtype") + .Attr("dtype: type") + .Attr("Tkeys: {int64, int32}") + .Attr("is_inference: bool = false") + .SetShapeFn([](InferenceContext* c) { + std::vector handle_shape_and_type; + TF_RETURN_IF_ERROR(shape_inference::ValidateVariableResourceHandle( + c, &handle_shape_and_type)); + + ShapeHandle unused; + TF_RETURN_IF_ERROR( + c->WithRankAtLeast(handle_shape_and_type[0].shape, 1, &unused)); + + ShapeHandle params_subshape; + params_subshape = handle_shape_and_type[0].shape; + // TF_RETURN_IF_ERROR( + // c->Subshape(handle_shape_and_type.shape, 1, ¶ms_subshape)); + ShapeHandle indices_shape = c->input(1); + ShapeHandle out; + TF_RETURN_IF_ERROR(c->Concatenate(indices_shape, params_subshape, &out)); + c->set_output(0, out); + return OkStatus(); + }) + .Doc(R"doc( +Gather slices from the variable pointed to by `resource` according to `indices`. + +`indices` must be an integer tensor of any dimension (usually 0-D or 1-D). +Produces an output tensor with shape `indices.shape + params.shape[1:]` where: + +```python + # Scalar indices + output[:, ..., :] = params[indices, :, ... :] + + # Vector indices + output[i, :, ..., :] = params[indices[i], :, ... :] + + # Higher rank indices + output[i, ..., j, :, ... 
:] = params[indices[i, ..., j], :, ..., :] +``` + +)doc"); + +REGISTER_OP("EVGetFrequency") + .Input("resource_handle: resource") + .Input("ids: Tkeys") + .Output("output: int64") + .Attr("Tkeys: {int64, int32}") + .Attr("Tvalues: type") + .SetShapeFn([](InferenceContext* c) { return OkStatus(); }) + .Doc(R"doc()doc"); + +REGISTER_OP("EVGetVersion") + .Input("resource_handle: resource") + .Input("ids: Tkeys") + .Output("output: int64") + .Attr("Tkeys: {int64, int32}") + .Attr("Tvalues: type") + .SetShapeFn([](InferenceContext* c) { return OkStatus(); }) + .Doc(R"doc()doc"); + +REGISTER_OP("KvResourceLookupTier") + .Input("resource_handle: resource") + .Input("ids: Tkeys") + .Output("output: int32") + .Attr("Tkeys: {int64, int32}") + .Attr("dtype: type") + .SetShapeFn([](InferenceContext* c) { return OkStatus(); }) + .Doc(R"doc()doc"); + +REGISTER_OP("KvResourceLookupResource") + .Input("resource_handle: resource") + .Attr("Tkeys: {int64, int32}") + .Attr("dtype: type = DT_FLOAT") + .Output("output: int64") + .SetShapeFn([](InferenceContext* c) { + c->set_output(0, c->Scalar()); + return OkStatus(); + }) + .Doc(R"doc()doc"); + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/ops/save_restore_ops.cc b/deepray/custom_ops/embedding_variable/cc/ops/save_restore_ops.cc new file mode 100644 index 00000000..b49d868b --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/ops/save_restore_ops.cc @@ -0,0 +1,122 @@ +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/util/saved_tensor_slice_util.h" + +namespace tensorflow { + +using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +REGISTER_OP("SaveV3") + .Input("prefix: string") + .Input("tensor_names: string") + .Input("shape_and_slices: string") + .Input("ev_names: string") + .Input("ev_resources: int64") + .Input("tensors: dtypes") + .Attr("dtypes: list(type)") + .Attr("ev_key_types: list(type) = []") + .Attr("has_ev: bool = false") + .SetIsStateful() + .SetShapeFn([](InferenceContext* c) { + ShapeHandle unused; + ShapeHandle s; + DimensionHandle unused_dim; + + // Validate prefix. + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused)); + + // Validate tensor_names and shapes_and_slices. + for (int i = 1; i <= 2; ++i) { + TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &s)); + TF_RETURN_IF_ERROR( + c->WithValue(c->Dim(s, 0), c->num_inputs() - 5, &unused_dim)); + } + + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &s)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 1, &s)); + return OkStatus(); + }); + +REGISTER_OP("KvResourceImport") + .Input("resource_handle: resource") + .Input("value: dtype") + .Input("empty_key: Tkeys") + .Input("keys: Tkeys") + .Input("values: dtype") + .Input("versions: int64") + .Attr("shape: shape") + .Attr("Tkeys: {int64, int32}") + .Attr("dtype: type") + .Attr("steps_to_live: int = 0") + .Attr("ht_type: string = ''") + .Attr("ht_partition_num: int = 1000") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle handle; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); + + // TODO(dingchen): Validate keys and values shape. + return OkStatus(); + }) + .Doc(R"doc( +Replaces the contents of the table with the specified keys and values. + +The tensor `keys` must be of the same type as the keys of the table. +The tensor `values` must be of the type of the table values. 
+ +resource_handle: Handle to the table. +keys: Any shape. Keys to look up. +values: Values to associate with keys. +)doc"); + +REGISTER_OP("KvResourceImportV3") + .Input("prefix: string") + .Input("resource_self: resource") + .Input("tensor_names: string") + .Input("empty_key: Tkeys") + .Attr("shape: shape") + .Attr("partition_id: int = 0") + .Attr("partition_num: int = 1") + .Attr("Tkeys: {int64, int32}") + .Attr("dtype: type") + .Attr("reset_version: bool = false") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle handle; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &handle)); + return OkStatus(); + }) + .Doc(R"doc()doc"); + +REGISTER_OP("KvResourceExport") + .Input("resource_handle: resource") + .Output("keys: Tkeys") + .Output("values: Tvalues") + .Output("versions: int64") + .Output("freqs: int64") + .Attr("Tkeys: {int64, int32}") + .Attr("Tvalues: type") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle values = c->UnknownShape(); + TF_RETURN_IF_ERROR(c->WithRankAtLeast(values, 2, &values)); + ShapeHandle keys = c->UnknownShapeOfRank(1); + ShapeHandle versions = c->UnknownShapeOfRank(1); + ShapeHandle freqs = c->UnknownShapeOfRank(1); + c->set_output(0, keys); + c->set_output(1, values); + c->set_output(2, versions); + c->set_output(3, freqs); + return OkStatus(); + }) + .Doc(R"doc( +Outputs all keys and values in the kv resource. + +resource_handle: Handle to the kvResource. +keys: Vector of all keys present in the table. +values: Tensor of all values in the table. Indexed in parallel with `keys`. +versions: Vector of all versions present in the table. +freqs: Vector of all freqs present in the table. +)doc"); + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/ops/training_adagrad_ops.cc b/deepray/custom_ops/embedding_variable/cc/ops/training_adagrad_ops.cc new file mode 100644 index 00000000..d61ea68b --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/ops/training_adagrad_ops.cc @@ -0,0 +1,109 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +static ShapeHandle ShapeOrHandleShape(InferenceContext* c, int input) { + auto* handle_data = c->input_handle_shapes_and_types(input); + if (handle_data != nullptr && !handle_data->empty() && + (*handle_data)[0].dtype != DT_INVALID) { + return (*handle_data)[0].shape; + } + return c->input(input); +} + +static Status HandleKvGradAndIndicesInputs(InferenceContext* c, bool sparse, + int grad_idx, ShapeHandle* s) { + ShapeHandle grad = ShapeOrHandleShape(c, grad_idx); + if (!sparse) { + TF_RETURN_IF_ERROR(c->Merge(*s, grad, s)); + return OkStatus(); + } + // Indices is a vector where indices.dim[0].rank == grad[0].rank. 
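+  // That is, indices must be rank-1, its length must equal grad's leading
+  // dimension, and the trailing dimensions of grad must be mergeable with
+  // the variable's value shape.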
+ ShapeHandle indices; + TF_RETURN_IF_ERROR(c->WithRank(c->input(grad_idx + 1), 1, &indices)); + DimensionHandle unused; + TF_RETURN_IF_ERROR(c->Merge(c->Dim(indices, 0), c->Dim(grad, 0), &unused)); + + // Trailing part of grad matches trailing part of *s. + ShapeHandle grad_unknown_first; + TF_RETURN_IF_ERROR(c->Subshape(grad, 1, &grad_unknown_first)); + TF_RETURN_IF_ERROR(c->Merge(*s, grad_unknown_first, s)); + + return OkStatus(); +} + +static Status KvResourceApplyAdagradShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); // lr + TF_RETURN_IF_ERROR( + HandleKvGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return OkStatus(); +} + +#define REGISTER_OP_BY_NAME(name) \ + REGISTER_OP(name) \ + .Input("var: resource") \ + .Input("accum: resource") \ + .Input("lr: T") \ + .Input("grad: T") \ + .Input("indices: Tindices") \ + .Input("global_step: Tstep") \ + .Attr("T: numbertype") \ + .Attr("Tindices: {int32, int64}") \ + .Attr("Tstep: {int32, int64}") \ + .Attr("use_locking: bool = false") \ + .SetShapeFn([](InferenceContext* c) { \ + return KvResourceApplyAdagradShapeFn(c, true /* sparse */); \ + }) \ + .Doc(R"doc()doc") +REGISTER_OP_BY_NAME("KvResourceSparseApplyAdagrad"); +REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyAdagrad"); +#undef REGISTER_OP_BY_NAME + +#define REGISTER_OP_BY_NAME(name) \ + REGISTER_OP(name) \ + .Input("var: resource") \ + .Input("accum: resource") \ + .Input("lr: T") \ + .Input("grad: T") \ + .Input("indices: Tindices") \ + .Input("global_step: Tstep") \ + .Input("indices_counts: int64") \ + .Attr("T: numbertype") \ + .Attr("Tindices: {int32, int64}") \ + .Attr("Tstep: {int32, int64}") \ + .Attr("use_locking: bool = false") \ + .SetShapeFn([](InferenceContext* c) { \ + return KvResourceApplyAdagradShapeFn(c, true /* sparse */); \ + }) \ + .Doc(R"doc()doc") +REGISTER_OP_BY_NAME("KvResourceSparseApplyAdagradWithCounts"); +REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyAdagradWithCounts"); +#undef REGISTER_OP_BY_NAME + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/ops/training_adam_async_ops.cc b/deepray/custom_ops/embedding_variable/cc/ops/training_adam_async_ops.cc new file mode 100644 index 00000000..a19cfeb8 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/ops/training_adam_async_ops.cc @@ -0,0 +1,129 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +static ShapeHandle ShapeOrHandleShape(InferenceContext* c, int input) { + auto* handle_data = c->input_handle_shapes_and_types(input); + if (handle_data != nullptr && !handle_data->empty() && + (*handle_data)[0].dtype != DT_INVALID) { + return (*handle_data)[0].shape; + } + return c->input(input); +} + +static Status HandleKvGradAndIndicesInputs(InferenceContext* c, bool sparse, + int grad_idx, ShapeHandle* s) { + ShapeHandle grad = ShapeOrHandleShape(c, grad_idx); + if (!sparse) { + TF_RETURN_IF_ERROR(c->Merge(*s, grad, s)); + return OkStatus(); + } + // Indices is a vector where indices.dim[0].rank == grad[0].rank. + ShapeHandle indices; + TF_RETURN_IF_ERROR(c->WithRank(c->input(grad_idx + 1), 1, &indices)); + DimensionHandle unused; + TF_RETURN_IF_ERROR(c->Merge(c->Dim(indices, 0), c->Dim(grad, 0), &unused)); + + // Trailing part of grad matches trailing part of *s. + ShapeHandle grad_unknown_first; + TF_RETURN_IF_ERROR(c->Subshape(grad, 1, &grad_unknown_first)); + TF_RETURN_IF_ERROR(c->Merge(*s, grad_unknown_first, s)); + + return OkStatus(); +} + +static Status KvApplyAdamAsyncShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // m + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // v + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // beta1_power + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // beta2_power + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // lr + TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); // beta1 + TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused)); // beta2 + TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused)); // epsilon + TF_RETURN_IF_ERROR( + HandleKvGradAndIndicesInputs(c, sparse, 9 /* grad_idx */, &s)); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return OkStatus(); +} + +#define REGISTER_OP_BY_NAME(name) \ + REGISTER_OP(name) \ + .Input("var: resource") \ + .Input("m: resource") \ + .Input("v: resource") \ + .Input("beta1_power: resource") \ + .Input("beta2_power: resource") \ + .Input("lr: T") \ + .Input("beta1: T") \ + .Input("beta2: T") \ + .Input("epsilon: T") \ + .Input("grad: T") \ + .Input("indices: Tindices") \ + .Input("global_step: Tstep") \ + .Attr("T: numbertype") \ + .Attr("Tindices: {int32, int64}") \ + .Attr("Tstep: {int32, int64}") \ + .Attr("use_locking: bool = false") \ + .Attr("apply_sparse_rmsprop: bool = false") \ + .Attr("indices_as_pointer: bool = false") \ + .SetShapeFn([](InferenceContext* c) { \ + return KvApplyAdamAsyncShapeFn(c, true /* sparse */); \ + }) +REGISTER_OP_BY_NAME("KvResourceSparseApplyAdamAsync"); +REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyAdamAsync"); +#undef REGISTER_OP_BY_NAME + +#define REGISTER_OP_BY_NAME(name) \ + REGISTER_OP(name) \ + .Input("var: resource") \ + .Input("m: resource") \ + .Input("v: resource") \ + .Input("beta1_power: resource") \ + .Input("beta2_power: resource") \ + .Input("lr: T") \ + .Input("beta1: T") \ + .Input("beta2: T") \ + .Input("epsilon: T") \ + .Input("grad: T") \ + .Input("indices: Tindices") \ + .Input("global_step: Tstep") \ + 
.Input("indices_counts: int64") \ + .Attr("T: numbertype") \ + .Attr("Tindices: {int32, int64}") \ + .Attr("Tstep: {int32, int64}") \ + .Attr("use_locking: bool = false") \ + .Attr("apply_sparse_rmsprop: bool = false") \ + .Attr("indices_as_pointer: bool = false") \ + .SetShapeFn([](InferenceContext* c) { \ + return KvApplyAdamAsyncShapeFn(c, true /* sparse */); \ + }) +REGISTER_OP_BY_NAME("KvResourceSparseApplyAdamAsyncWithCounts"); +REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyAdamAsyncWithCounts"); +#undef REGISTER_OP_BY_NAME + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/ops/training_adam_ops.cc b/deepray/custom_ops/embedding_variable/cc/ops/training_adam_ops.cc new file mode 100644 index 00000000..64be1148 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/ops/training_adam_ops.cc @@ -0,0 +1,127 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +static ShapeHandle ShapeOrHandleShape(InferenceContext* c, int input) { + auto* handle_data = c->input_handle_shapes_and_types(input); + if (handle_data != nullptr && !handle_data->empty() && + (*handle_data)[0].dtype != DT_INVALID) { + return (*handle_data)[0].shape; + } + return c->input(input); +} + +static Status HandleKvGradAndIndicesInputs(InferenceContext* c, bool sparse, + int grad_idx, ShapeHandle* s) { + ShapeHandle grad = ShapeOrHandleShape(c, grad_idx); + if (!sparse) { + TF_RETURN_IF_ERROR(c->Merge(*s, grad, s)); + return OkStatus(); + } + // Indices is a vector where indices.dim[0].rank == grad[0].rank. + ShapeHandle indices; + TF_RETURN_IF_ERROR(c->WithRank(c->input(grad_idx + 1), 1, &indices)); + DimensionHandle unused; + TF_RETURN_IF_ERROR(c->Merge(c->Dim(indices, 0), c->Dim(grad, 0), &unused)); + + // Trailing part of grad matches trailing part of *s. 
+ ShapeHandle grad_unknown_first; + TF_RETURN_IF_ERROR(c->Subshape(grad, 1, &grad_unknown_first)); + TF_RETURN_IF_ERROR(c->Merge(*s, grad_unknown_first, s)); + + return OkStatus(); +} + +static Status KvResourceApplyAdamShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // m + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // v + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // beta1_power + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // beta2_power + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // lr + TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); // beta1 + TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused)); // beta2 + TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused)); // epsilon + TF_RETURN_IF_ERROR( + HandleKvGradAndIndicesInputs(c, sparse, 9 /* grad_idx */, &s)); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return OkStatus(); +} + +#define REGISTER_OP_BY_NAME(name) \ + REGISTER_OP(name) \ + .Input("var: resource") \ + .Input("m: resource") \ + .Input("v: resource") \ + .Input("beta1_power: T") \ + .Input("beta2_power: T") \ + .Input("lr: T") \ + .Input("beta1: T") \ + .Input("beta2: T") \ + .Input("epsilon: T") \ + .Input("grad: T") \ + .Input("indices: Tindices") \ + .Input("global_step: Tstep") \ + .Attr("T: numbertype") \ + .Attr("Tindices: {int32, int64}") \ + .Attr("Tstep: {int32, int64}") \ + .Attr("use_locking: bool = false") \ + .Attr("indices_as_pointer: bool = false") \ + .SetShapeFn([](InferenceContext* c) { \ + return KvResourceApplyAdamShapeFn(c, true /* sparse */); \ + }) +REGISTER_OP_BY_NAME("KvResourceSparseApplyAdam"); +REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyAdam"); +#undef REGISTER_OP_BY_NAME + +#define REGISTER_OP_BY_NAME(name) \ + REGISTER_OP(name) \ + .Input("var: resource") \ + .Input("m: resource") \ + .Input("v: resource") \ + .Input("beta1_power: T") \ + .Input("beta2_power: T") \ + .Input("lr: T") \ + .Input("beta1: T") \ + .Input("beta2: T") \ + .Input("epsilon: T") \ + .Input("grad: T") \ + .Input("indices: Tindices") \ + .Input("global_step: Tstep") \ + .Input("indices_counts: int64") \ + .Attr("T: numbertype") \ + .Attr("Tindices: {int32, int64}") \ + .Attr("Tstep: {int32, int64}") \ + .Attr("use_locking: bool = false") \ + .Attr("indices_as_pointer: bool = false") \ + .SetShapeFn([](InferenceContext* c) { \ + return KvResourceApplyAdamShapeFn(c, true /* sparse */); \ + }) +REGISTER_OP_BY_NAME("KvResourceSparseApplyAdamWithCounts"); +REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyAdamWithCounts"); +#undef REGISTER_OP_BY_NAME + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/ops/training_ftrl_ops.cc b/deepray/custom_ops/embedding_variable/cc/ops/training_ftrl_ops.cc new file mode 100644 index 00000000..319a6a8f --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/ops/training_ftrl_ops.cc @@ -0,0 +1,96 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +static ShapeHandle ShapeOrHandleShape(InferenceContext* c, int input) { + auto* handle_data = c->input_handle_shapes_and_types(input); + if (handle_data != nullptr && !handle_data->empty() && + (*handle_data)[0].dtype != DT_INVALID) { + return (*handle_data)[0].shape; + } + return c->input(input); +} + +static Status HandleKvGradAndIndicesInputs(InferenceContext* c, bool sparse, + int grad_idx, ShapeHandle* s) { + ShapeHandle grad = ShapeOrHandleShape(c, grad_idx); + if (!sparse) { + TF_RETURN_IF_ERROR(c->Merge(*s, grad, s)); + return OkStatus(); + } + // Indices is a vector where indices.dim[0].rank == grad[0].rank. + ShapeHandle indices; + TF_RETURN_IF_ERROR(c->WithRank(c->input(grad_idx + 1), 1, &indices)); + DimensionHandle unused; + TF_RETURN_IF_ERROR(c->Merge(c->Dim(indices, 0), c->Dim(grad, 0), &unused)); + + // Trailing part of grad matches trailing part of *s. + ShapeHandle grad_unknown_first; + TF_RETURN_IF_ERROR(c->Subshape(grad, 1, &grad_unknown_first)); + TF_RETURN_IF_ERROR(c->Merge(*s, grad_unknown_first, s)); + + return OkStatus(); +} + +static Status KvResourceApplyFtrlShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // accum + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // linear + TF_RETURN_IF_ERROR( + HandleKvGradAndIndicesInputs(c, sparse, 3 /* grad_idx */, &s)); + int idx = sparse ? 
5 : 4; + TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // lr + TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l1 + TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // l2 + TF_RETURN_IF_ERROR(c->WithRank(c->input(idx++), 0, &unused)); // lr_power + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return OkStatus(); +} + +#define REGISTER_OP_BY_NAME(name) \ + REGISTER_OP(name) \ + .Input("var: resource") \ + .Input("accum: resource") \ + .Input("linear: resource") \ + .Input("grad: T") \ + .Input("indices: Tindices") \ + .Input("lr: T") \ + .Input("l1: T") \ + .Input("l2: T") \ + .Input("lr_power: T") \ + .Attr("T: numbertype") \ + .Attr("Tindices: {int32, int64, string}") \ + .Attr("use_locking: bool = false") \ + .Attr("indices_as_pointer: bool = false") \ + .SetShapeFn([](InferenceContext* c) { \ + return KvResourceApplyFtrlShapeFn(c, true /* sparse */); \ + }) \ + .Doc(R"doc()doc") +REGISTER_OP_BY_NAME("KvResourceSparseApplyFtrl"); +REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyFtrl"); +#undef REGISTER_OP_BY_NAME + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/ops/training_sgd_ops.cc b/deepray/custom_ops/embedding_variable/cc/ops/training_sgd_ops.cc new file mode 100644 index 00000000..6ec435f5 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/ops/training_sgd_ops.cc @@ -0,0 +1,80 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +static ShapeHandle ShapeOrHandleShape(InferenceContext* c, int input) { + auto* handle_data = c->input_handle_shapes_and_types(input); + if (handle_data != nullptr && !handle_data->empty() && + (*handle_data)[0].dtype != DT_INVALID) { + return (*handle_data)[0].shape; + } + return c->input(input); +} + +static Status KvApplyGradientDescentShapeFn(InferenceContext* c) { + ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); // alpha + ShapeHandle grad = ShapeOrHandleShape(c, 2); + ShapeHandle indices; + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &indices)); + DimensionHandle unused2; + TF_RETURN_IF_ERROR(c->Merge(c->Dim(indices, 0), c->Dim(grad, 0), &unused2)); + return OkStatus(); +} + +#define REGISTER_OP_BY_NAME(name) \ + REGISTER_OP(name) \ + .Input("var: resource") \ + .Input("alpha: T") \ + .Input("grad: T") \ + .Input("indices: Tindices") \ + .Input("global_step: Tstep") \ + .Attr("T: numbertype") \ + .Attr("Tindices: {int32, int64}") \ + .Attr("Tstep: {int32, int64}") \ + .Attr("use_locking: bool = false") \ + .Attr("indices_as_pointer: bool = false") \ + .SetShapeFn(KvApplyGradientDescentShapeFn) +REGISTER_OP_BY_NAME("KvResourceSparseApplyGradientDescent"); +REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyGradientDescent"); +#undef REGISTER_OP_BY_NAME + +#define REGISTER_OP_BY_NAME(name) \ + REGISTER_OP(name) \ + .Input("var: resource") \ + .Input("alpha: T") \ + .Input("grad: T") \ + .Input("indices: Tindices") \ + .Input("global_step: Tstep") \ + .Input("counts: int64") \ + .Attr("T: numbertype") \ + .Attr("Tindices: {int32, int64}") \ + .Attr("Tstep: {int32, int64}") \ + .Attr("use_locking: bool = false") \ + .Attr("indices_as_pointer: bool = false") \ + .SetShapeFn(KvApplyGradientDescentShapeFn) +REGISTER_OP_BY_NAME("KvResourceSparseApplyGradientDescentWithCounts"); +REGISTER_OP_BY_NAME("_OPT_KvResourceSparseApplyGradientDescentWithCounts"); +#undef REGISTER_OP_BY_NAME + +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/tests/BUILD b/deepray/custom_ops/embedding_variable/cc/tests/BUILD new file mode 100644 index 00000000..6e0d0f8e --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/tests/BUILD @@ -0,0 +1,65 @@ +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") + +cc_library( + name = "embedding_variable_test_lib", + hdrs = [ + "embedding_variable_test.h", + ], + deps = [ + "//deepray/custom_ops/embedding_variable:kv_variable_util", + "//deepray/custom_ops/embedding_variable/cc/lib:tensor_bundle", + ], +) + +cc_test( + name = "embedding_variable_ops_test", + srcs = [ + "embedding_variable_ops_test.cc", + "embedding_variable_test.h", + ], + copts = if_cuda(["-DGOOGLE_CUDA=1"]), + deps = [ + ":embedding_variable_test_lib", + "//deepray/custom_ops/utils:tensor_testutil", + "@com_google_googletest//:gtest", + "@com_google_googletest//:gtest_main", + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:tf_header_lib", + "@local_tsl//tsl/platform:test", + "@local_tsl//tsl/platform:test_main", + ], +) + +cc_test( + name = "embedding_variable_performance_test", + srcs = [ + "embedding_variable_performance_test.cc", + "embedding_variable_test.h", + ], + deps = [ + 
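        # embedding_variable_test_lib bundles embedding_variable_test.h with
+        # the kv_variable_util and tensor_bundle deps the fixtures rely on. +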
":embedding_variable_test_lib", + "@com_google_googletest//:gtest", + "@com_google_googletest//:gtest_main", + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:tf_header_lib", + "@local_tsl//tsl/platform:test", + "@local_tsl//tsl/platform:test_main", + ], +) + +cc_test( + name = "embedding_variable_memory_test", + srcs = [ + "embedding_variable_memory_test.cc", + "embedding_variable_test.h", + ], + deps = [ + ":embedding_variable_test_lib", + "@com_google_googletest//:gtest", + "@com_google_googletest//:gtest_main", + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:tf_header_lib", + "@local_tsl//tsl/platform:test", + "@local_tsl//tsl/platform:test_main", + ], +) diff --git a/deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_memory_test.cc b/deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_memory_test.cc new file mode 100644 index 00000000..bc095509 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_memory_test.cc @@ -0,0 +1,80 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +======================================================================*/ +#include "embedding_variable_test.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace tensorflow { +namespace embedding { +float PerfMemory(Tensor& default_value, const std::vector& id_list, + int value_size, int64 default_value_dim, int64 filter_freq = 0, + int64 steps_to_live = 0, int64 record_freq = false) { + auto ev = CreateEmbeddingVar(value_size, default_value, default_value_dim, + filter_freq, steps_to_live, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, record_freq); + void* value_ptr = nullptr; + bool is_filter = false; + double start_mem, end_mem; + start_mem = getResident() * getpagesize(); + for (int i = 0; i < id_list.size(); i++) { + ev->LookupOrCreateKey(id_list[i], &value_ptr, &is_filter, false); + if (is_filter) ev->flat(value_ptr); + } + end_mem = getResident() * getpagesize(); + double used_mb = (end_mem - start_mem) / 1000000; + LOG(INFO) << "[TestMemory]Use Memory: " << used_mb; + return used_mb; +} + +TEST(EmbeddingVariabelMemoryTest, TestMemory) { + int value_size = 32; + int64 default_value_dim = 4096; + int filter_freq = 2; + Tensor default_value(DT_FLOAT, TensorShape({default_value_dim, value_size})); + auto default_value_matrix = default_value.matrix(); + for (int i = 0; i < default_value_dim; i++) { + for (int j = 0; j < value_size; j++) { + default_value_matrix(i, j) = i * value_size + j; + } + } + + int num_of_ids = 1000000; + std::vector id_list(num_of_ids); + for (int i = 0; i < num_of_ids; i++) { + id_list[i] = i; + } + float used_mb = + PerfMemory(default_value, id_list, value_size, default_value_dim); + float theoritical_mb = + 50 + num_of_ids * (value_size * sizeof(float)) / 1000000; + LOG(INFO) << "[TestMemory]Theoritical Memory: " << theoritical_mb; + EXPECT_TRUE((used_mb > theoritical_mb * 0.99) && + (used_mb < 
theoritical_mb * 1.07)); + + for (int i = 0; i < num_of_ids / 2; i++) { + id_list.emplace_back(i); + } + used_mb = PerfMemory(default_value, id_list, value_size, default_value_dim, + filter_freq); + theoritical_mb = 50 + num_of_ids * + (8 + value_size * sizeof(float) / 2 + + 4 /*memory for ids_list*/) / + 1000000; + LOG(INFO) << "[TestMemory]Theoritical Memory: " << theoritical_mb; + EXPECT_TRUE((used_mb > theoritical_mb * 0.99) && + (used_mb < theoritical_mb * 1.25)); +} +} // namespace embedding +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_ops_test.cc b/deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_ops_test.cc new file mode 100644 index 00000000..a29d3d16 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_ops_test.cc @@ -0,0 +1,1324 @@ +#include + +#include "deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.h" +#include "deepray/custom_ops/embedding_variable/cc/lib/tensor_bundle.h" +#include "deepray/custom_ops/utils/tensor_testutil.h" +#include "embedding_variable_test.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "tensorflow/core/common_runtime/device/device_id.h" +#include "tensorflow/core/common_runtime/gpu/gpu_device.h" +#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h" +#endif // GOOGLE_CUDA + +#include +#include + +#ifdef TENSORFLOW_USE_JEMALLOC +#include "jemalloc/jemalloc.h" +#endif + +namespace tensorflow { +namespace embedding { +namespace { +const int THREADNUM = 16; +const int64 max = 2147483647; + +struct ProcMemory { + long size; // total program size + long resident; // resident set size + long share; // shared pages + long trs; // text (code) + long lrs; // library + long drs; // data/stack + long dt; // dirty pages + + ProcMemory() + : size(0), resident(0), share(0), trs(0), lrs(0), drs(0), dt(0) {} +}; + +ProcMemory getProcMemory() { + ProcMemory m; + FILE* fp = fopen("/proc/self/statm", "r"); + if (fp == NULL) { + LOG(ERROR) << "Fail to open /proc/self/statm."; + return m; + } + + if (fscanf(fp, "%ld %ld %ld %ld %ld %ld %ld", &m.size, &m.resident, &m.share, + &m.trs, &m.lrs, &m.drs, &m.dt) != 7) { + fclose(fp); + LOG(ERROR) << "Fail to fscanf /proc/self/statm."; + return m; + } + fclose(fp); + + return m; +} + +double getSize() { + ProcMemory m = getProcMemory(); + return m.size; +} + +double getResident() { + ProcMemory m = getProcMemory(); + return m.resident; +} + +string Prefix(const string& prefix) { + return strings::StrCat(testing::TmpDir(), "/", prefix); +} + +std::vector AllTensorKeys(BundleReader* reader) { + std::vector ret; + reader->Seek(kHeaderEntryKey); + reader->Next(); + for (; reader->Valid(); reader->Next()) { + // ret.push_back(reader->key().ToString()); + ret.push_back(std::string(reader->key())); + } + return ret; +} + +TEST(EmbeddingVariableTest, TestEmptyEV) { + int64 value_size = 8; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 9.0)); + { + auto variable = CreateEmbeddingVar(value_size, value, 1); + + LOG(INFO) << "size:" << variable->Size(); + Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); + + BundleWriter writer(Env::Default(), Prefix("foo")); + embedding::ShrinkArgs shrink_args; + shrink_args.global_step = 1; + variable->Save("var/part_0", Prefix("foo"), &writer, shrink_args); + 
TF_ASSERT_OK(writer.Finish()); + + { + BundleReader reader(Env::Default(), Prefix("foo")); + TF_ASSERT_OK(reader.status()); + EXPECT_EQ(AllTensorKeys(&reader), + std::vector( + {"var/part_0-freqs", "var/part_0-freqs_filtered", + "var/part_0-keys", "var/part_0-keys_filtered", + "var/part_0-partition_filter_offset", + "var/part_0-partition_offset", "var/part_0-values", + "var/part_0-versions", "var/part_0-versions_filtered"})); + { + string key = "var/part_0-keys"; + EXPECT_TRUE(reader.Contains(key)); + // Tests for LookupDtypeAndShape(). + DataType dtype; + TensorShape shape; + TF_ASSERT_OK(reader.LookupDtypeAndShape(key, &dtype, &shape)); + // Tests for Lookup(), checking tensor contents. + Tensor val(dtype, TensorShape{0}); + TF_ASSERT_OK(reader.Lookup(key, &val)); + LOG(INFO) << "read keys:" << val.DebugString(); + } + { + string key = "var/part_0-values"; + EXPECT_TRUE(reader.Contains(key)); + // Tests for LookupDtypeAndShape(). + DataType dtype; + TensorShape shape; + TF_ASSERT_OK(reader.LookupDtypeAndShape(key, &dtype, &shape)); + // Tests for Lookup(), checking tensor contents. + Tensor val(dtype, TensorShape{0, value_size}); + TF_ASSERT_OK(reader.Lookup(key, &val)); + LOG(INFO) << "read values:" << val.DebugString(); + } + { + string key = "var/part_0-versions"; + EXPECT_TRUE(reader.Contains(key)); + // Tests for LookupDtypeAndShape(). + DataType dtype; + TensorShape shape; + TF_ASSERT_OK(reader.LookupDtypeAndShape(key, &dtype, &shape)); + // Tests for Lookup(), checking tensor contents. + Tensor val(dtype, TensorShape{0}); + TF_ASSERT_OK(reader.Lookup(key, &val)); + LOG(INFO) << "read versions:" << val.DebugString(); + } + } + } +} + +TEST(EmbeddingVariableTest, TestEVExportSmallLockless) { + int64 value_size = 8; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 9.0)); + auto variable = CreateEmbeddingVar(value_size, value, 1, 0, 5); + + Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); + + for (int64 i = 0; i < 5; i++) { + void* value_ptr = nullptr; + variable->LookupOrCreateKey(i, &value_ptr); + typename TTypes::Flat vflat = variable->flat(value_ptr); + vflat(i) = 5.0; + } + + LOG(INFO) << "size:" << variable->Size(); + + BundleWriter writer(Env::Default(), Prefix("foo")); + embedding::ShrinkArgs shrink_args; + shrink_args.global_step = 1; + variable->Save("var/part_0", Prefix("foo"), &writer, shrink_args); + TF_ASSERT_OK(writer.Finish()); + + { + BundleReader reader(Env::Default(), Prefix("foo")); + TF_ASSERT_OK(reader.status()); + EXPECT_EQ( + AllTensorKeys(&reader), + std::vector( + {"var/part_0-freqs", "var/part_0-freqs_filtered", "var/part_0-keys", + "var/part_0-keys_filtered", "var/part_0-partition_filter_offset", + "var/part_0-partition_offset", "var/part_0-values", + "var/part_0-versions", "var/part_0-versions_filtered"})); + { + string key = "var/part_0-keys"; + EXPECT_TRUE(reader.Contains(key)); + // Tests for LookupDtypeAndShape(). + DataType dtype; + TensorShape shape; + TF_ASSERT_OK(reader.LookupDtypeAndShape(key, &dtype, &shape)); + // Tests for Lookup(), checking tensor contents. + Tensor val(dtype, TensorShape{5}); + TF_ASSERT_OK(reader.Lookup(key, &val)); + LOG(INFO) << "read keys:" << val.DebugString(); + } + { + string key = "var/part_0-values"; + EXPECT_TRUE(reader.Contains(key)); + // Tests for LookupDtypeAndShape(). + DataType dtype; + TensorShape shape; + TF_ASSERT_OK(reader.LookupDtypeAndShape(key, &dtype, &shape)); + // Tests for Lookup(), checking tensor contents. 
+ Tensor val(dtype, TensorShape{5, value_size}); + TF_ASSERT_OK(reader.Lookup(key, &val)); + LOG(INFO) << "read values:" << val.DebugString(); + } + { + string key = "var/part_0-versions"; + EXPECT_TRUE(reader.Contains(key)); + // Tests for LookupDtypeAndShape(). + DataType dtype; + TensorShape shape; + TF_ASSERT_OK(reader.LookupDtypeAndShape(key, &dtype, &shape)); + // Tests for Lookup(), checking tensor contents. + Tensor val(dtype, TensorShape{5}); + TF_ASSERT_OK(reader.Lookup(key, &val)); + LOG(INFO) << "read versions:" << val.DebugString(); + } + } +} + +TEST(EmbeddingVariableTest, TestEVExportLargeLockless) { + int64 value_size = 128; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 9.0)); + float* fill_v = (float*)malloc(value_size * sizeof(float)); + auto variable = CreateEmbeddingVar(value_size, value, 1, 0, 5); + + Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); + + int64 ev_size = 10048576; + for (int64 i = 0; i < ev_size; i++) { + void* value_ptr = nullptr; + variable->LookupOrCreateKey(i, &value_ptr); + typename TTypes::Flat vflat = variable->flat(value_ptr); + } + + LOG(INFO) << "size:" << variable->Size(); + + BundleWriter writer(Env::Default(), Prefix("foo")); + embedding::ShrinkArgs shrink_args; + shrink_args.global_step = 1; + variable->Save("var/part_0", Prefix("foo"), &writer, shrink_args); + TF_ASSERT_OK(writer.Finish()); + + { + BundleReader reader(Env::Default(), Prefix("foo")); + TF_ASSERT_OK(reader.status()); + EXPECT_EQ( + AllTensorKeys(&reader), + std::vector( + {"var/part_0-freqs", "var/part_0-freqs_filtered", "var/part_0-keys", + "var/part_0-keys_filtered", "var/part_0-partition_filter_offset", + "var/part_0-partition_offset", "var/part_0-values", + "var/part_0-versions", "var/part_0-versions_filtered"})); + { + string key = "var/part_0-keys"; + EXPECT_TRUE(reader.Contains(key)); + // Tests for LookupDtypeAndShape(). + DataType dtype; + TensorShape shape; + TF_ASSERT_OK(reader.LookupDtypeAndShape(key, &dtype, &shape)); + // Tests for Lookup(), checking tensor contents. + Tensor val(dtype, TensorShape{ev_size}); + TF_ASSERT_OK(reader.Lookup(key, &val)); + LOG(INFO) << "read keys:" << val.DebugString(); + } + { + string key = "var/part_0-values"; + EXPECT_TRUE(reader.Contains(key)); + // Tests for LookupDtypeAndShape(). + DataType dtype; + TensorShape shape; + TF_ASSERT_OK(reader.LookupDtypeAndShape(key, &dtype, &shape)); + // Tests for Lookup(), checking tensor contents. + Tensor val(dtype, TensorShape{ev_size, value_size}); + LOG(INFO) << "read values:" << val.DebugString(); + TF_ASSERT_OK(reader.Lookup(key, &val)); + LOG(INFO) << "read values:" << val.DebugString(); + } + { + string key = "var/part_0-versions"; + EXPECT_TRUE(reader.Contains(key)); + // Tests for LookupDtypeAndShape(). + DataType dtype; + TensorShape shape; + TF_ASSERT_OK(reader.LookupDtypeAndShape(key, &dtype, &shape)); + // Tests for Lookup(), checking tensor contents. 
+ Tensor val(dtype, TensorShape{ev_size}); + TF_ASSERT_OK(reader.Lookup(key, &val)); + LOG(INFO) << "read versions:" << val.DebugString(); + } + } +} + +void multi_insertion(EmbeddingVar* variable, int64 value_size) { + for (long j = 0; j < 5; j++) { + void* value_ptr = nullptr; + variable->LookupOrCreateKey(j, &value_ptr); + typename TTypes::Flat vflat = variable->flat(value_ptr); + } +} + +TEST(EmbeddingVariableTest, TestMultiInsertion) { + int64 value_size = 128; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 9.0)); + float* fill_v = (float*)malloc(value_size * sizeof(float)); + auto variable = CreateEmbeddingVar(value_size, value, 1); + + std::vector insert_threads(THREADNUM); + for (size_t i = 0; i < THREADNUM; i++) { + insert_threads[i] = std::thread(multi_insertion, variable, value_size); + } + for (auto& t : insert_threads) { + t.join(); + } + + ASSERT_EQ(variable->Size(), 5); +} + +void InsertAndLookup(EmbeddingVar* variable, int64* keys, + long ReadLoops, int value_size) { + for (long j = 0; j < ReadLoops; j++) { + void* val = nullptr; + void* val_1 = nullptr; + bool is_filter = true; + variable->LookupOrCreateKey(keys[j], &val, &is_filter, false); + variable->LookupOrCreateKey(keys[j], &val_1, &is_filter, false); + ASSERT_EQ(val, val_1); + } +} + +void MultiBloomFilter(EmbeddingVar* var, int value_size, + int64 i) { + for (long j = 0; j < 1; j++) { + void* val = nullptr; + bool is_filter = true; + var->LookupOrCreateKey(i + 1, &val, &is_filter, false); + } +} + +TEST(EmbeddingVariableTest, TestBloomFilter) { + int value_size = 10; + Tensor value(DT_FLOAT, TensorShape({value_size})); + std::vector default_value = {0.0, 1.0, 2.0, 3.0, 4.0, + 5.0, 6.0, 7.0, 8.0, 9.0}; + test::FillValues(&value, default_value); + + auto var = CreateEmbeddingVar(value_size, value, 1, 3, 5, -1.0, + embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, false, 10, 0.01); + + // float *val = (float *)malloc((value_size+1)*sizeof(float)); + void* val = nullptr; + bool is_filter = true; + var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(1, &val, &is_filter, false); + var->LookupOrCreateKey(2, &val, &is_filter, false); + + std::vector keylist; + std::vector valuelist; + std::vector version_list; + std::vector freq_list; + + ASSERT_EQ(var->Size(), 1); +} + +TEST(EmbeddingVariableTest, TestBloomCounterInt64) { + int value_size = 10; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 10.0)); + float* fill_v = (float*)malloc(value_size * sizeof(float)); + auto var = CreateEmbeddingVar( + value_size, value, 1, 3, 5, -1.0, embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, false, 10, 0.01, DT_UINT64); + + float* val = (float*)malloc((value_size + 1) * sizeof(float)); + + std::vector hash_val1 = {17, 7, 48, 89, 9, 20, 56}; + std::vector hash_val2 = {58, 14, 10, 90, 28, 14, 67}; + std::vector hash_val3 = {64, 63, 9, 77, 7, 38, 11}; + std::vector hash_val4 = {39, 10, 79, 28, 58, 55, 60}; + + std::map tab; + for (auto it : hash_val1) tab.insert(std::pair(it, 1)); + for (auto it : hash_val2) { + if (tab.find(it) != tab.end()) + tab[it]++; + else + tab.insert(std::pair(it, 1)); + } + for (auto it : hash_val3) { + if (tab.find(it) != tab.end()) + tab[it]++; + else + tab.insert(std::pair(it, 1)); + } + for (auto it : hash_val4) { + if (tab.find(it) != tab.end()) + 
tab[it]++; + else + tab.insert(std::pair(it, 1)); + } + + std::vector insert_threads(4); + for (size_t i = 0; i < 4; i++) { + insert_threads[i] = std::thread(MultiBloomFilter, var, value_size, i); + } + for (auto& t : insert_threads) { + t.join(); + } + + auto filter = var->GetFilter(); + auto bloom_filter = + static_cast>*>( + filter); + //(int64 *)var->GetBloomCounter(); + int64* counter = (int64*)bloom_filter->GetBloomCounter(); + + for (auto it : hash_val1) { + ASSERT_EQ(counter[it], tab[it]); + } + for (auto it : hash_val2) { + ASSERT_EQ(counter[it], tab[it]); + } + for (auto it : hash_val3) { + ASSERT_EQ(counter[it], tab[it]); + } + for (auto it : hash_val4) { + ASSERT_EQ(counter[it], tab[it]); + } +} + +TEST(EmbeddingVariableTest, TestBloomCounterInt32) { + int value_size = 10; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 10.0)); + float* fill_v = (float*)malloc(value_size * sizeof(float)); + + auto var = CreateEmbeddingVar( + value_size, value, 1, 3, 5, -1.0, embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, false, 10, 0.01, DT_UINT32); + + float* val = (float*)malloc((value_size + 1) * sizeof(float)); + + std::vector hash_val1 = {17, 7, 48, 89, 9, 20, 56}; + std::vector hash_val2 = {58, 14, 10, 90, 28, 14, 67}; + std::vector hash_val3 = {64, 63, 9, 77, 7, 38, 11}; + std::vector hash_val4 = {39, 10, 79, 28, 58, 55, 60}; + + std::map tab; + for (auto it : hash_val1) tab.insert(std::pair(it, 1)); + for (auto it : hash_val2) { + if (tab.find(it) != tab.end()) + tab[it]++; + else + tab.insert(std::pair(it, 1)); + } + for (auto it : hash_val3) { + if (tab.find(it) != tab.end()) + tab[it]++; + else + tab.insert(std::pair(it, 1)); + } + for (auto it : hash_val4) { + if (tab.find(it) != tab.end()) + tab[it]++; + else + tab.insert(std::pair(it, 1)); + } + + std::vector insert_threads(4); + for (size_t i = 0; i < 4; i++) { + insert_threads[i] = std::thread(MultiBloomFilter, var, value_size, i); + } + for (auto& t : insert_threads) { + t.join(); + } + + auto filter = var->GetFilter(); + auto bloom_filter = + static_cast>*>( + filter); + //(int64 *)var->GetBloomCounter(); + int32* counter = (int32*)bloom_filter->GetBloomCounter(); + + for (auto it : hash_val1) { + ASSERT_EQ(counter[it], tab[it]); + } + for (auto it : hash_val2) { + ASSERT_EQ(counter[it], tab[it]); + } + for (auto it : hash_val3) { + ASSERT_EQ(counter[it], tab[it]); + } + for (auto it : hash_val4) { + ASSERT_EQ(counter[it], tab[it]); + } +} + +TEST(EmbeddingVariableTest, TestBloomCounterInt16) { + int value_size = 10; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 10.0)); + float* fill_v = (float*)malloc(value_size * sizeof(float)); + + auto var = CreateEmbeddingVar( + value_size, value, 1, 3, 5, -1.0, embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, false, 10, 0.01, DT_UINT16); + + float* val = (float*)malloc((value_size + 1) * sizeof(float)); + + std::vector hash_val1 = {17, 7, 48, 89, 9, 20, 56}; + std::vector hash_val2 = {58, 14, 10, 90, 28, 14, 67}; + std::vector hash_val3 = {64, 63, 9, 77, 7, 38, 11}; + std::vector hash_val4 = {39, 10, 79, 28, 58, 55, 60}; + + std::map tab; + for (auto it : hash_val1) tab.insert(std::pair(it, 1)); + for (auto it : hash_val2) { + if (tab.find(it) != tab.end()) + tab[it]++; + else + tab.insert(std::pair(it, 1)); + } + for (auto it : hash_val3) { + if (tab.find(it) != tab.end()) + tab[it]++; + else + tab.insert(std::pair(it, 1)); + } + for (auto it : hash_val4) 
{ + if (tab.find(it) != tab.end()) + tab[it]++; + else + tab.insert(std::pair(it, 1)); + } + + std::vector insert_threads(4); + for (size_t i = 0; i < 4; i++) { + insert_threads[i] = std::thread(MultiBloomFilter, var, value_size, i); + } + for (auto& t : insert_threads) { + t.join(); + } + + // int16* counter = (int16 *)var->GetBloomCounter(); + auto filter = var->GetFilter(); + auto bloom_filter = + static_cast>*>( + filter); + //(int64 *)var->GetBloomCounter(); + int16* counter = (int16*)bloom_filter->GetBloomCounter(); + + for (auto it : hash_val1) { + ASSERT_EQ(counter[it], tab[it]); + } + for (auto it : hash_val2) { + ASSERT_EQ(counter[it], tab[it]); + } + for (auto it : hash_val3) { + ASSERT_EQ(counter[it], tab[it]); + } + for (auto it : hash_val4) { + ASSERT_EQ(counter[it], tab[it]); + } +} + +TEST(EmbeddingVariableTest, TestBloomCounterInt8) { + int value_size = 10; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 10.0)); + float* fill_v = (float*)malloc(value_size * sizeof(float)); + + auto var = CreateEmbeddingVar( + value_size, value, 1, 3, 5, -1.0, embedding::StorageType::DRAM, + {1024, 1024, 1024, 1024}, false, 10, 0.01, DT_UINT8); + + float* val = (float*)malloc((value_size + 1) * sizeof(float)); + + std::vector hash_val1 = {17, 7, 48, 89, 9, 20, 56}; + std::vector hash_val2 = {58, 14, 10, 90, 28, 14, 67}; + std::vector hash_val3 = {64, 63, 9, 77, 7, 38, 11}; + std::vector hash_val4 = {39, 10, 79, 28, 58, 55, 60}; + + std::map tab; + for (auto it : hash_val1) tab.insert(std::pair(it, 1)); + for (auto it : hash_val2) { + if (tab.find(it) != tab.end()) + tab[it]++; + else + tab.insert(std::pair(it, 1)); + } + for (auto it : hash_val3) { + if (tab.find(it) != tab.end()) + tab[it]++; + else + tab.insert(std::pair(it, 1)); + } + for (auto it : hash_val4) { + if (tab.find(it) != tab.end()) + tab[it]++; + else + tab.insert(std::pair(it, 1)); + } + + std::vector insert_threads(4); + for (size_t i = 0; i < 4; i++) { + insert_threads[i] = std::thread(MultiBloomFilter, var, value_size, i); + } + for (auto& t : insert_threads) { + t.join(); + } + + auto filter = var->GetFilter(); + auto bloom_filter = + static_cast>*>( + filter); + int8* counter = (int8*)bloom_filter->GetBloomCounter(); + //(int64 *)var->GetBloomCounter(); + // int8* counter = (int8 *)var->GetBloomCounter(); + + for (auto it : hash_val1) { + ASSERT_EQ((int)counter[it], tab[it]); + } + for (auto it : hash_val2) { + ASSERT_EQ((int)counter[it], tab[it]); + } + for (auto it : hash_val3) { + ASSERT_EQ((int)counter[it], tab[it]); + } + for (auto it : hash_val4) { + ASSERT_EQ((int)counter[it], tab[it]); + } +} + +TEST(EmbeddingVariableTest, TestInsertAndLookup) { + int64 value_size = 128; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 10)); + auto variable = CreateEmbeddingVar(value_size, value, 1); + + int64 InsertLoops = 1000; + bool* flag = (bool*)malloc(sizeof(bool) * max); + srand((unsigned)time(NULL)); + int64* keys = (int64*)malloc(sizeof(int64) * InsertLoops); + + for (long i = 0; i < max; i++) { + flag[i] = 0; + } + + int index = 0; + while (index < InsertLoops) { + long j = rand() % max; + if (flag[j] == 1) // the number is already set as a key + continue; + else { // the number is not selected as a key + keys[index] = j; + index++; + flag[j] = 1; + } + } + free(flag); + std::vector insert_threads(THREADNUM); + for (size_t i = 0; i < THREADNUM; i++) { + insert_threads[i] = std::thread(InsertAndLookup, 
variable, + &keys[i * InsertLoops / THREADNUM], + InsertLoops / THREADNUM, value_size); + } + for (auto& t : insert_threads) { + t.join(); + } +} + +void MultiFilter(EmbeddingVar* variable, int value_size) { + bool is_filter = true; + void* val; + variable->LookupOrCreateKey(20, &val, &is_filter, false); +} + +TEST(EmbeddingVariableTest, TestFeatureFilterParallel) { + int value_size = 10; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 10.0)); + float* fill_v = (float*)malloc(value_size * sizeof(float)); + auto var = CreateEmbeddingVar(value_size, value, 1, 7, 5); + + float* val = (float*)malloc((value_size + 1) * sizeof(float)); + int thread_num = 5; + std::vector insert_threads(thread_num); + for (size_t i = 0; i < thread_num; i++) { + insert_threads[i] = std::thread(MultiFilter, var, value_size); + } + for (auto& t : insert_threads) { + t.join(); + } + + void* value_ptr = nullptr; + var->LookupOrCreateKey(20, &value_ptr); + ASSERT_EQ(var->GetFreq(20), thread_num); +} + +EmbeddingVar* InitEV_Lockless(int64 value_size) { + Tensor value(DT_INT64, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 10)); + auto variable = CreateEmbeddingVar(value_size, value, 1); + + return variable; +} + +void MultiLookup(EmbeddingVar* variable, int64 InsertLoop, + int thread_num, int i) { + for (int64 j = i * InsertLoop / thread_num; + j < (i + 1) * InsertLoop / thread_num; j++) { + void* value_ptr = nullptr; + variable->LookupOrCreateKey(j, &value_ptr); + } +} + +void BM_MULTIREAD_LOCKLESS(int iters, int thread_num) { + // testing::StopTiming(); + // testing::UseRealTime(); + + int64 value_size = 128; + auto variable = InitEV_Lockless(value_size); + int64 InsertLoop = 1000000; + + float* fill_v = (float*)malloc(value_size * sizeof(float)); + + for (int64 i = 0; i < InsertLoop; i++) { + void* value_ptr = nullptr; + variable->LookupOrCreateKey(i, &value_ptr); + typename TTypes::Flat vflat = variable->flat(value_ptr); + } + + // testing::StartTiming(); + while (iters--) { + std::vector insert_threads(thread_num); + for (size_t i = 0; i < thread_num; i++) { + insert_threads[i] = + std::thread(MultiLookup, variable, InsertLoop, thread_num, i); + } + for (auto& t : insert_threads) { + t.join(); + } + } +} + +TEST(EmbeddingVariableTest, TestAllocate) { + int value_len = 8; + double t0 = getResident() * getpagesize() / 1024.0 / 1024.0; + double t1 = 0; + LOG(INFO) << "memory t0: " << t0; + for (int64 i = 0; i < 1000; ++i) { + float* tensor_val = TypedAllocator::Allocate( + ev_allocator(), value_len, AllocationAttributes()); + t1 = getResident() * getpagesize() / 1024.0 / 1024.0; + memset(tensor_val, 0, sizeof(float) * value_len); + } + double t2 = getResident() * getpagesize() / 1024.0 / 1024.0; + LOG(INFO) << "memory t1-t0: " << t1 - t0; + LOG(INFO) << "memory t2-t1: " << t2 - t1; + LOG(INFO) << "memory t2-t0: " << t2 - t0; +} + +TEST(EmbeddingVariableTest, TestEVStorageType_DRAM) { + int64 value_size = 128; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 9.0)); + float* fill_v = (float*)malloc(value_size * sizeof(float)); + auto variable = CreateEmbeddingVar(value_size, value, 1); + + int64 ev_size = 100; + for (int64 i = 0; i < ev_size; i++) { + void* val = nullptr; + bool is_filter = true; + variable->LookupOrCreateKey(i, &val, &is_filter, false); + } + + LOG(INFO) << "size:" << variable->Size(); +} + +void t1(KVInterface* hashmap) { + for (int i = 0; i < 100; ++i) { + 
hashmap->Insert(i, nullptr); + } +} + +TEST(EmbeddingVariableTest, TestRemoveLockless) { + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), embedding::StorageType::DRAM, false, false, + {false, 0}); + KVInterface* hashmap = + new LocklessHashMap(feat_desc); + feat_desc->InitSlotInfo(0, 100, {nullptr, 1}); + ASSERT_EQ(hashmap->Size(), 0); + LOG(INFO) << "hashmap size: " << hashmap->Size(); + auto t = std::thread(t1, hashmap); + t.join(); + LOG(INFO) << "hashmap size: " << hashmap->Size(); + ASSERT_EQ(hashmap->Size(), 100); + TF_CHECK_OK(hashmap->Remove(1)); + TF_CHECK_OK(hashmap->Remove(2)); + ASSERT_EQ(hashmap->Size(), 98); + LOG(INFO) << "2 size:" << hashmap->Size(); +} + +TEST(EmbeddingVariableTest, TestLRUCachePrefetch) { + BatchCache* cache = new LRUCache(); + int num_ids = 5; + std::vector prefetch_ids; + int index = 0; + int64 true_evict_size; + int64* evict_ids = new int64[num_ids]; + std::vector access_seq; + for (int i = 1; i <= num_ids; i++) { + for (int j = 0; j < i; j++) { + prefetch_ids.emplace_back(i); + } + } + cache->add_to_prefetch_list(prefetch_ids.data(), prefetch_ids.size()); + ASSERT_EQ(cache->size(), 0); + true_evict_size = cache->get_evic_ids(evict_ids, num_ids); + ASSERT_EQ(true_evict_size, 0); + for (int i = 1; i <= 2; i++) { + for (int j = 0; j < i; j++) { + access_seq.emplace_back(i); + } + } + cache->add_to_cache(access_seq.data(), access_seq.size()); + ASSERT_EQ(cache->size(), 2); + true_evict_size = cache->get_evic_ids(evict_ids, num_ids); + ASSERT_EQ(true_evict_size, 2); + access_seq.clear(); + for (int i = 5; i >= 3; i--) { + for (int j = 0; j < i; j++) { + access_seq.emplace_back(i); + } + } + cache->add_to_cache(access_seq.data(), access_seq.size()); + ASSERT_EQ(cache->size(), 3); + true_evict_size = cache->get_evic_ids(evict_ids, 2); + ASSERT_EQ(evict_ids[0], 5); + ASSERT_EQ(evict_ids[1], 4); + ASSERT_EQ(cache->size(), 1); + + delete cache; + delete[] evict_ids; +} + +TEST(EmbeddingVariableTest, TestLRUCache) { + BatchCache* cache = new LRUCache(); + int num_ids = 30; + int num_access = 100; + int num_evict = 50; + int64 ids[num_access] = {0}; + int64 evict_ids[num_evict] = {0}; + for (int i = 0; i < num_access; i++) { + ids[i] = i % num_ids; + } + cache->update(ids, num_access); + int64 size = cache->get_evic_ids(evict_ids, num_evict); + ASSERT_EQ(size, num_ids); + ASSERT_EQ(cache->size(), 0); + for (int i = 0; i < size; i++) { + ASSERT_EQ(evict_ids[i], (num_access % num_ids + i) % num_ids); + } +} + +TEST(EmbeddingVariableTest, TestLRUCacheGetCachedIds) { + BatchCache* cache = new LRUCache(); + int num_ids = 30; + int num_access = 100; + int num_evict = 15; + int num_cache = 20; + int64 ids[num_access] = {0}; + int64 evict_ids[num_evict] = {0}; + for (int i = 0; i < num_access; i++) { + ids[i] = i % num_ids; + } + cache->update(ids, num_access); + ASSERT_EQ(cache->size(), num_ids); + int64* cached_ids = new int64[num_cache]; + int64* cached_freqs = new int64[num_cache]; + int64 true_size = + cache->get_cached_ids(cached_ids, num_cache, nullptr, cached_freqs); + ASSERT_EQ(true_size, 20); + cache->get_evic_ids(evict_ids, num_evict); + ASSERT_EQ(cache->size(), 15); + true_size = + cache->get_cached_ids(cached_ids, num_cache, nullptr, cached_freqs); + ASSERT_EQ(true_size, 15); + delete cache; + delete[] cached_ids; + delete[] cached_freqs; +} + +TEST(EmbeddingVariableTest, TestLFUCacheGetCachedIds) { + BatchCache* cache = new LFUCache(); + int num_ids = 30; + int num_access = 100; + int num_evict = 15; + int num_cache = 20; + int64 
ids[num_access] = {0}; + int64 evict_ids[num_evict] = {0}; + for (int i = 0; i < num_access; i++) { + ids[i] = i % num_ids; + } + cache->update(ids, num_access); + ASSERT_EQ(cache->size(), num_ids); + int64* cached_ids = new int64[num_cache]; + int64* cached_freqs = new int64[num_cache]; + int64 true_size = + cache->get_cached_ids(cached_ids, num_cache, nullptr, cached_freqs); + ASSERT_EQ(true_size, 20); + cache->get_evic_ids(evict_ids, num_evict); + ASSERT_EQ(cache->size(), 15); + true_size = + cache->get_cached_ids(cached_ids, num_cache, nullptr, cached_freqs); + ASSERT_EQ(true_size, 15); + delete cache; + delete[] cached_ids; + delete[] cached_freqs; +} + +TEST(EmbeddingVariableTest, TestLFUCachePrefetch) { + BatchCache* cache = new LFUCache(); + int num_ids = 5; + std::vector prefetch_ids; + int index = 0; + int64 true_evict_size; + int64* evict_ids = new int64[num_ids]; + std::vector access_seq; + for (int i = 1; i <= num_ids; i++) { + for (int j = 0; j < i; j++) { + prefetch_ids.emplace_back(i); + } + } + cache->add_to_prefetch_list(prefetch_ids.data(), prefetch_ids.size()); + ASSERT_EQ(cache->size(), 0); + true_evict_size = cache->get_evic_ids(evict_ids, num_ids); + ASSERT_EQ(true_evict_size, 0); + for (int i = 1; i <= 2; i++) { + for (int j = 0; j < i; j++) { + access_seq.emplace_back(i); + } + } + cache->add_to_cache(access_seq.data(), access_seq.size()); + ASSERT_EQ(cache->size(), 2); + true_evict_size = cache->get_evic_ids(evict_ids, num_ids); + ASSERT_EQ(true_evict_size, 2); + access_seq.clear(); + for (int i = 5; i >= 3; i--) { + for (int j = 0; j < i; j++) { + access_seq.emplace_back(i); + } + } + cache->add_to_cache(access_seq.data(), access_seq.size()); + ASSERT_EQ(cache->size(), 3); + true_evict_size = cache->get_evic_ids(evict_ids, 2); + ASSERT_EQ(evict_ids[0], 3); + ASSERT_EQ(evict_ids[1], 4); + ASSERT_EQ(cache->size(), 1); + + delete cache; + delete[] evict_ids; +} + +TEST(EmbeddingVariableTest, TestLFUCache) { + BatchCache* cache = new LFUCache(); + int num_ids = 30; + int num_access = 100; + int num_evict = 50; + int64 ids[num_access] = {0}; + int64 evict_ids[num_evict] = {0}; + for (int i = 0; i < num_access; i++) { + ids[i] = i % num_ids; + } + cache->update(ids, num_access); + int64 size = cache->get_evic_ids(evict_ids, num_evict); + ASSERT_EQ(size, num_ids); + ASSERT_EQ(cache->size(), 0); + for (int i = 0; i < size; i++) { + ASSERT_EQ(evict_ids[i], (num_access % num_ids + i) % num_ids); + } +} + +const int total_size = 1024 * 8; +const int th_num = 1; +const int malloc_size = total_size / th_num; + +void malloc_use_allocator(Allocator* allocator) { + timespec start; + timespec end; + float* first = (float*)allocator->AllocateRaw(0, sizeof(float)); + + clock_gettime(CLOCK_MONOTONIC, &start); + for (int i = 0; i < malloc_size; ++i) { + int ev_list_size = 32; + float* ptr_ = + (float*)allocator->AllocateRaw(0, sizeof(float) * ev_list_size); + } + clock_gettime(CLOCK_MONOTONIC, &end); + LOG(INFO) << "cost time: " + << ((double)(end.tv_sec - start.tv_sec) * 1000000000 + end.tv_nsec - + start.tv_nsec) / + 1000000 + << "ms"; +} + +TEST(EmbeddingVariableTest, TestEVMalloc) { + std::thread th_arr[th_num]; + for (unsigned int i = 0; i < th_num; ++i) { + th_arr[i] = std::thread(malloc_use_allocator, ev_allocator()); + } + for (unsigned int i = 0; i < th_num; ++i) { + th_arr[i].join(); + } +} + +TEST(EmbeddingVariableTest, TestCPUMalloc) { + std::thread th_arr[th_num]; + for (unsigned int i = 0; i < th_num; ++i) { + th_arr[i] = std::thread(malloc_use_allocator, 
cpu_allocator()); + } + for (unsigned int i = 0; i < th_num; ++i) { + th_arr[i].join(); + } +} + +#if GOOGLE_CUDA +TEST(EmbeddingVariableTest, TestGPUMalloc) { + SessionOptions sops; + std::unique_ptr device = + DeviceFactory::NewDevice(DEVICE_GPU, sops, "/job:a/replica:0/task:0"); + Allocator* gpu_allocator = GPUProcessState::singleton()->GetGPUAllocator( + GPUOptions(), TfDeviceId(0), 1 << 26, {} /* peer_gpu_ids */); + + std::thread th_arr[th_num]; + for (unsigned int i = 0; i < th_num; ++i) { + th_arr[i] = std::thread(malloc_use_allocator, gpu_allocator); + } + for (unsigned int i = 0; i < th_num; ++i) { + th_arr[i].join(); + } +} + +TEST(EmbeddingVariableTest, TestCPUGPUMalloc) { + SessionOptions sops; + std::unique_ptr device = + DeviceFactory::NewDevice(DEVICE_GPU, sops, "/job:a/replica:0/task:0"); + + auto gpu_allocator = GPUProcessState::singleton()->GetGPUAllocator( + GPUOptions(), TfDeviceId(0), 1 << 26, {} /* peer_gpu_ids */); + auto mem_pool = new EmbeddingMemoryPool(gpu_allocator, 256, 1024); + float* ptr_1 = mem_pool->Allocate(); + float* ptr_2 = mem_pool->Allocate(); + std::vector value_ptrs; + value_ptrs.emplace_back(ptr_1); + mem_pool->Deallocate(value_ptrs); + value_ptrs.clear(); + value_ptrs.emplace_back(ptr_2); + mem_pool->Deallocate(value_ptrs); + float* ptr_3 = mem_pool->Allocate(); + ASSERT_EQ(ptr_1, ptr_3); + delete mem_pool; +} +#endif // GOOGLE_CUDA + +void malloc_free_use_allocator(Allocator* allocator) { + timespec start; + timespec end; + std::vector ptrs; + float* first = (float*)allocator->AllocateRaw(0, sizeof(float)); + + clock_gettime(CLOCK_MONOTONIC, &start); + for (int i = 0; i < malloc_size; ++i) { + int ev_list_size = 32; + float* ptr_ = + (float*)allocator->AllocateRaw(0, sizeof(float) * ev_list_size); + ptrs.push_back(ptr_); + } + clock_gettime(CLOCK_MONOTONIC, &end); + LOG(INFO) << "first time: " + << ((double)(end.tv_sec - start.tv_sec) * 1000000000 + end.tv_nsec - + start.tv_nsec) / + 1000000 + << "ms"; + + clock_gettime(CLOCK_MONOTONIC, &start); + for (auto iter = ptrs.begin(); iter != ptrs.end(); iter++) { + allocator->DeallocateRaw(*iter); + } + clock_gettime(CLOCK_MONOTONIC, &end); + LOG(INFO) << "free time: " + << ((double)(end.tv_sec - start.tv_sec) * 1000000000 + end.tv_nsec - + start.tv_nsec) / + 1000000 + << "ms"; + + clock_gettime(CLOCK_MONOTONIC, &start); + for (int i = 0; i < malloc_size; ++i) { + int ev_list_size = 32; + float* ptr_ = + (float*)allocator->AllocateRaw(0, sizeof(float) * ev_list_size); + } + clock_gettime(CLOCK_MONOTONIC, &end); + LOG(INFO) << "second time: " + << ((double)(end.tv_sec - start.tv_sec) * 1000000000 + end.tv_nsec - + start.tv_nsec) / + 1000000 + << "ms"; +} + +TEST(EmbeddingVariableTest, TestEVMallocFree) { + std::thread th_arr[th_num]; + for (unsigned int i = 0; i < th_num; ++i) { + th_arr[i] = std::thread(malloc_free_use_allocator, ev_allocator()); + } + for (unsigned int i = 0; i < th_num; ++i) { + th_arr[i].join(); + } +} + +void SingleCommit(KVInterface* hashmap, std::vector keys, + int bias) { + std::vector value_ptrs; + for (int64 i = 0; i < keys.size(); ++i) { + void* tmp = cpu_allocator()->AllocateRaw(0, 124 * sizeof(float) + 16); + for (int j = 0; j < 124; j++) { + ((float*)tmp)[j] = keys[i] + bias; + } + value_ptrs.push_back(tmp); + } + ASSERT_EQ(keys.size(), value_ptrs.size()); + uint64 start = Env::Default()->NowNanos(); + for (int64 i = 0; i < keys.size(); i++) { + hashmap->Commit(keys[i], value_ptrs[i]); + } + uint64 end = Env::Default()->NowNanos(); + uint64 result_cost = end - start; +} + 
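+// TestCompaction (below) exercises the SSDHashKV compaction path: keys are
+// committed several times with different bias values via SingleCommit, and
+// later lookups are expected to return the value from the most recent commit
+// for each key. The sleep(1) calls give the (possibly asynchronous)
+// compaction a chance to finish before the values are verified.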
+void TestCompaction() { + std::string temp_dir = testing::TmpDir(); + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), embedding::StorageType::DRAM_SSDHASH, true, true, + {false, 0}); + auto hashmap = new SSDHashKV(temp_dir, feat_desc); + feat_desc->InitSlotInfo(0, 124, {nullptr, 1}); + hashmap->Init(); + ASSERT_EQ(hashmap->Size(), 0); + std::vector ids; + for (int i = 0; i < 262144; i++) { + ids.emplace_back(i); + } + auto t1 = std::thread(SingleCommit, hashmap, ids, 3); + t1.join(); + ids.clear(); + for (int i = 0; i < 131073; i++) { + ids.emplace_back(i); + } + t1 = std::thread(SingleCommit, hashmap, ids, 1); + t1.join(); + ids.clear(); + sleep(1); + void* val = nullptr; + for (int i = 131073; i < 262144; i++) { + hashmap->Lookup(i, &val); + float* v = (float*)val; + for (int j = 0; j < 124; j++) { + ASSERT_EQ(v[j], i + 3); + } + } + for (int i = 131073; i < 262144; i++) { + ids.emplace_back(i); + } + t1 = std::thread(SingleCommit, hashmap, ids, 2); + t1.join(); + ids.clear(); + ids.emplace_back(262155); + t1 = std::thread(SingleCommit, hashmap, ids, 0); + t1.join(); + sleep(1); + for (int i = 0; i < 131073; i++) { + hashmap->Lookup(i, &val); + float* v = (float*)val; + for (int j = 0; j < 124; j++) { + ASSERT_EQ(v[j], i + 1); + } + } + for (int i = 131073; i < 262144; i++) { + hashmap->Lookup(i, &val); + float* v = (float*)val; + for (int j = 0; j < 124; j++) { + ASSERT_EQ(v[j], i + 2); + } + } + delete hashmap; +} + +TEST(KVInterfaceTest, TestSSDKVAsyncCompaction) { + setenv("TF_SSDHASH_ASYNC_COMPACTION", "true", 1); + TestCompaction(); +} + +TEST(KVInterfaceTest, TestSSDKVSyncCompaction) { + setenv("TF_SSDHASH_ASYNC_COMPACTION", "false", 1); + TestCompaction(); +} + +void TestReadEmbFile() { + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), embedding::StorageType::DRAM_SSDHASH, true, true, + {false, 0}); + std::string temp_dir = testing::TmpDir(); + auto hashmap = new SSDHashKV(temp_dir, feat_desc); + feat_desc->InitSlotInfo(0, 124, {nullptr, 1}); + hashmap->Init(); + ASSERT_EQ(hashmap->Size(), 0); + std::vector ids; + for (int i = 0; i < 262145; i++) { + ids.emplace_back(i); + } + SingleCommit(hashmap, ids, 3); + sleep(1); + ids.clear(); + void* val = nullptr; + for (int i = 0; i < 262144; i++) { + hashmap->Lookup(i, &val); + float* v = (float*)val; + for (int j = 0; j < 124; j++) { + ASSERT_EQ(v[j], i + 3); + } + } + delete hashmap; +} + +TEST(KVInterfaceTest, TestMmapMadviseFile) { + setenv("TF_SSDHASH_IO_SCHEME", "mmap_and_madvise", 1); + TestReadEmbFile(); +} + +TEST(KVInterfaceTest, TestMmapFile) { + std::string temp_dir = testing::TmpDir(); + setenv("TF_SSDHASH_IO_SCHEME", "mmap", 1); + TestReadEmbFile(); +} + +TEST(KVInterfaceTest, TestDirectIoFile) { + std::string temp_dir = testing::TmpDir(); + setenv("TF_SSDHASH_IO_SCHEME", "directio", 1); + TestReadEmbFile(); +} + +void InsertKey(EmbeddingVar* variable, int value_size) { + float* val = (float*)malloc((value_size + 1) * sizeof(float)); + for (int64 i = 0; i < 100000000; i++) { + void* val = nullptr; + bool is_filter = true; + variable->LookupOrCreateKey(20, &val, &is_filter, false); + } +} + +void RemoveKey(EmbeddingVar* variable) { + for (int64 i = 0; i < 10; i++) { + sleep(1); + variable->storage()->Remove(20); + } +} + +TEST(EmbeddingVariableTest, TestLookupRemoveConcurrency) { + int value_size = 10; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 10.0)); + auto var = CreateEmbeddingVar(value_size, value, 1); + 
int thread_num = 5; + std::vector insert_threads(thread_num); + for (size_t i = 0; i < thread_num - 1; i++) { + insert_threads[i] = std::thread(InsertKey, var, value_size); + } + insert_threads[thread_num - 1] = std::thread(RemoveKey, var); + for (auto& t : insert_threads) { + t.join(); + } +} + +TEST(EmbeddingVariableTest, TestInsertAndGetSnapshot) { + int value_size = 10; + Tensor value(DT_FLOAT, TensorShape({value_size})); + test::FillValues(&value, std::vector(value_size, 10.0)); + auto var = CreateEmbeddingVar(value_size, value, 1); + float* set_value = (float*)malloc(value_size * sizeof(float)); + // Insertion + for (int i = 0; i < 100; i++) { + for (int j = 0; j < value_size; j++) { + set_value[j] = i + j; + } + var->Insert(i, set_value); + } + free(set_value); + // GetSnapshot + std::vector key_list; + std::vector value_ptr_list; + std::vector version_list; + std::vector freq_list; + var->GetSnapshot(&key_list, &value_ptr_list, &version_list, &freq_list); + for (int i = 0; i < key_list.size(); i++) { + ASSERT_EQ(key_list[i], i); + for (int j = 0; j < value_size; j++) { + ASSERT_EQ(value_ptr_list[i][j], i + j); + } + } +} + +} // namespace +} // namespace embedding +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_performance_test.cc b/deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_performance_test.cc new file mode 100644 index 00000000..a7de65cc --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_performance_test.cc @@ -0,0 +1,455 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#include "embedding_variable_test.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace tensorflow { +namespace embedding { +void GenerateSkewIds(int num_of_ids, float skew_factor, + std::vector& hot_ids_list, + std::vector& cold_ids_list) { + int num_of_hot_ids = num_of_ids * (1 - skew_factor); + int num_of_cold_ids = num_of_ids - num_of_hot_ids; + std::set hot_ids_set; + std::set cold_ids_set; + hot_ids_list.resize(num_of_hot_ids); + cold_ids_list.resize(num_of_cold_ids); + srand((unsigned)time(NULL)); + // Generate hot ids + for (int i = 0; i < num_of_hot_ids; i++) { + bool flag = false; + int64 key; + do { + key = rand() % 100000000; + flag = hot_ids_set.insert(key).second; + hot_ids_list[i] = key; + } while (!flag); + } + // Generate cold ids + for (int i = 0; i < num_of_cold_ids; i++) { + bool flag = false; + int64 key; + do { + key = rand() % 100000000; + if (hot_ids_set.find(key) != hot_ids_set.end()) { + flag = false; + } else { + flag = cold_ids_set.insert(key).second; + cold_ids_list[i] = key; + } + } while (!flag); + } +} + +void InitSkewInputBatch(std::vector>& input_batches, + float skew_factor, + const std::vector& hot_ids_list, + const std::vector& cold_ids_list) { + srand((unsigned)time(NULL)); + int num_of_hot_ids = hot_ids_list.size(); + int num_of_cold_ids = cold_ids_list.size(); + int num_of_batch = input_batches.size(); + for (int i = 0; i < input_batches.size(); i++) { + for (int j = 0; j < input_batches[i].size(); j++) { + int tmp = rand() % 10; + if ((float)tmp * 0.1 < skew_factor) { + int pos = rand() % num_of_hot_ids; + input_batches[i][j] = hot_ids_list[pos]; + } else { + int pos = rand() % num_of_cold_ids; + input_batches[i][j] = cold_ids_list[pos]; + } + } + } +} + +void GenerateSkewInput(int num_of_ids, float skew_factor, + std::vector>& input_batches) { + std::vector hot_ids_list; + std::vector cold_ids_list; + // Generate hot ids + GenerateSkewIds(num_of_ids, skew_factor, hot_ids_list, cold_ids_list); + // Select id for each batch + InitSkewInputBatch(input_batches, skew_factor, hot_ids_list, cold_ids_list); +} + +void thread_lookup_or_create(EmbeddingVar* ev, + const int64* input_batch, float* default_value, + int default_value_dim, float** outputs, + int value_size, int start, int end) { + void* value_ptr = nullptr; + bool is_filter = false; + for (int i = start; i < end; i++) { + ev->LookupOrCreateKey(input_batch[i], &value_ptr, &is_filter, false); + if (is_filter) { + auto val = ev->flat(value_ptr); + memcpy(outputs[i], &val(0), sizeof(float) * value_size); + } else { + int default_value_index = input_batch[i] % default_value_dim; + memcpy(outputs[i], default_value + default_value_index * value_size, + sizeof(float) * value_size); + } + } +} + +double PerfLookupOrCreate(const std::vector>& input_batches, + int num_thread, int filter_freq = 0) { + int value_size = 32; + int64 default_value_dim = 4096; + Tensor default_value(DT_FLOAT, TensorShape({default_value_dim, value_size})); + auto default_value_matrix = default_value.matrix(); + for (int i = 0; i < default_value_dim; i++) { + for (int j = 0; j < value_size; j++) { + default_value_matrix(i, j) = i * value_size + j; + } + } + auto ev = CreateEmbeddingVar(value_size, default_value, default_value_dim, + filter_freq); + std::vector worker_threads(num_thread); + double total_time = 0.0; + timespec start, end; + for (int k = 0; k < input_batches.size(); k++) { + // Allocate Outputs for each batch + std::vector 
outputs(input_batches[k].size()); + for (int i = 0; i < outputs.size(); i++) { + outputs[i] = + (float*)cpu_allocator()->AllocateRaw(0, sizeof(float) * value_size); + } + // Execution + std::vector> thread_task_range(num_thread); + for (int i = 0; i < num_thread; i++) { + int st = input_batches[k].size() / num_thread * i; + int ed = input_batches[k].size() / num_thread * (i + 1); + ed = (ed > input_batches[k].size()) ? input_batches[k].size() : ed; + thread_task_range[i].first = st; + thread_task_range[i].second = ed; + } + clock_gettime(CLOCK_MONOTONIC, &start); + for (int i = 0; i < num_thread; i++) { + worker_threads[i] = std::thread( + thread_lookup_or_create, ev, input_batches[k].data(), + default_value_matrix.data(), default_value_dim, outputs.data(), + value_size, thread_task_range[i].first, thread_task_range[i].second); + } + for (int i = 0; i < num_thread; i++) { + worker_threads[i].join(); + } + clock_gettime(CLOCK_MONOTONIC, &end); + if (k > 10) + total_time += ((double)(end.tv_sec - start.tv_sec) * 1000000000 + + end.tv_nsec - start.tv_nsec); + // Check + for (int i = 0; i < input_batches[k].size(); i++) { + int64 key = input_batches[k][i]; + float* output = outputs[i]; + for (int j = 0; j < value_size; j++) { + float val = default_value_matrix(key % default_value_dim, j); + if (output[j] != val) { + LOG(INFO) << "Value Error: outputs[" << key << "][" << j << "] is " + << output[j] << ", while the answer is " << val; + return -1.0; + } + } + } + // Deallocate Output + for (auto ptr : outputs) { + cpu_allocator()->DeallocateRaw(ptr); + } + } + ev->Unref(); + return total_time; +} + +TEST(EmbeddingVariablePerformanceTest, TestLookupOrCreate) { + int num_of_batch = 100; + int batch_size = 1024 * 128; + int num_of_ids = 5000000; + std::vector> input_batches(num_of_batch); + for (int i = 0; i < num_of_batch; i++) { + input_batches[i].resize(batch_size); + } + LOG(INFO) << "[TestLookupOrCreate] Start generating skew input"; + GenerateSkewInput(num_of_ids, 0.8, input_batches); + LOG(INFO) << "[TestLookupOrCreate] Finish generating skew input"; + std::vector num_thread_vec({1, 2, 4, 8, 16}); + for (auto num_thread : num_thread_vec) { + LOG(INFO) << "[TestLookupOrCreate] Test LookupOrCreate With " << num_thread + << " threads."; + double exec_time = PerfLookupOrCreate(input_batches, num_thread); + if (exec_time == -1.0) { + LOG(INFO) << "[TestLookupOrCreate] Test Failed"; + } else { + LOG(INFO) << "[TestLookupOrCreate] Performance of LookupOrCreate With " + << num_thread << " threads: " << exec_time / 1000000 << " ms"; + } + } +} + +void thread_lookup(EmbeddingVar* ev, const int64* input_batch, + float** outputs, int value_size, int start, int end) { + void* value_ptr = nullptr; + bool is_filter = false; + for (int i = start; i < end; i++) { + ev->LookupKey(input_batch[i], &value_ptr); + auto val = ev->flat(value_ptr); + memcpy(outputs[i], &val(0), sizeof(float) * value_size); + } +} + +double PerfLookup(EmbeddingVar* ev, + const std::vector>& input_batches, + int num_thread, int value_size, float* default_value, + int64 default_value_dim) { + std::vector worker_threads(num_thread); + double total_time = 0.0; + timespec start, end; + for (int k = 0; k < input_batches.size(); k++) { + // Allocate Outputs for each batch + std::vector outputs(input_batches[k].size()); + for (int i = 0; i < outputs.size(); i++) { + outputs[i] = + (float*)cpu_allocator()->AllocateRaw(0, sizeof(float) * value_size); + } + // Execution + std::vector> thread_task_range(num_thread); + for (int i = 0; i <
num_thread; i++) { + int st = input_batches[k].size() / num_thread * i; + int ed = input_batches[k].size() / num_thread * (i + 1); + ed = (ed > input_batches[k].size()) ? input_batches[k].size() : ed; + thread_task_range[i].first = st; + thread_task_range[i].second = ed; + } + clock_gettime(CLOCK_MONOTONIC, &start); + for (int i = 0; i < num_thread; i++) { + worker_threads[i] = std::thread( + thread_lookup, ev, input_batches[k].data(), outputs.data(), + value_size, thread_task_range[i].first, thread_task_range[i].second); + } + for (int i = 0; i < num_thread; i++) { + worker_threads[i].join(); + } + clock_gettime(CLOCK_MONOTONIC, &end); + if (k > 10) + total_time += ((double)(end.tv_sec - start.tv_sec) * 1000000000 + + end.tv_nsec - start.tv_nsec); + // Check + for (int i = 0; i < input_batches[k].size(); i++) { + int64 key = input_batches[k][i]; + float* output = outputs[i]; + for (int j = 0; j < value_size; j++) { + float val = default_value[(key % default_value_dim) * value_size + j]; + if (output[j] != val) { + LOG(INFO) << "Value Error: outputs[" << key << "][" << j << "] is " + << output[j] << ", while the answer is " << val; + return -1.0; + } + } + } + // Deallocate Output + for (auto ptr : outputs) { + cpu_allocator()->DeallocateRaw(ptr); + } + } + return total_time; +} + +TEST(EmbeddingVariablePerformanceTest, TestLookup) { + int num_of_batch = 100; + int batch_size = 1024 * 128; + int num_of_ids = 5000000; + int value_size = 32; + int64 default_value_dim = 4096; + float skew_factor = 0.8; + + LOG(INFO) << "[TestLookup] Start initializing EV storage."; + std::vector hot_ids_list; + std::vector cold_ids_list; + GenerateSkewIds(num_of_ids, skew_factor, hot_ids_list, cold_ids_list); + + Tensor default_value(DT_FLOAT, TensorShape({default_value_dim, value_size})); + auto default_value_matrix = default_value.matrix(); + for (int i = 0; i < default_value_dim; i++) { + for (int j = 0; j < value_size; j++) { + default_value_matrix(i, j) = i * value_size + j; + } + } + auto ev = CreateEmbeddingVar(value_size, default_value, default_value_dim); + void* value_ptr = nullptr; + bool is_filter = false; + for (int i = 0; i < hot_ids_list.size(); i++) { + ev->LookupOrCreateKey(hot_ids_list[i], &value_ptr, &is_filter, false); + } + for (int i = 0; i < cold_ids_list.size(); i++) { + ev->LookupOrCreateKey(cold_ids_list[i], &value_ptr, &is_filter, false); + } + LOG(INFO) << "[TestLookup] End initializing EV storage."; + + LOG(INFO) << "[TestLookup] Start generating skew input"; + std::vector> input_batches(num_of_batch); + for (int i = 0; i < num_of_batch; i++) { + input_batches[i].resize(batch_size); + } + InitSkewInputBatch(input_batches, skew_factor, hot_ids_list, cold_ids_list); + LOG(INFO) << "[TestLookup] Finish generating skew input"; + std::vector num_thread_vec({1, 2, 4, 8, 16}); + for (auto num_thread : num_thread_vec) { + LOG(INFO) << "[TestLookup] Test Lookup With " << num_thread << " threads."; + double exec_time = + PerfLookup(ev, input_batches, num_thread, value_size, + (float*)default_value.data(), default_value_dim); + if (exec_time == -1.0) { + LOG(INFO) << "[TestLookup] Test Failed"; + } else { + LOG(INFO) << "[TestLookup] Performance of Lookup With " << num_thread + << " threads: " << exec_time / 1000000 << " ms"; + } + } + ev->Unref(); +} + +string Prefix(const string& prefix) { + return strings::StrCat(testing::TmpDir(), "/", prefix); +} + +void PerfSave(Tensor& default_value, const std::vector& id_list, + int value_size, int64 default_value_dim, int64 steps_to_live = 0, +
float l2_weight_threshold = -1.0) { + auto ev = CreateEmbeddingVar(value_size, default_value, default_value_dim, 0, + steps_to_live, l2_weight_threshold); + void* value_ptr = nullptr; + bool is_filter = false; + srand((unsigned)time(NULL)); + + for (int i = 0; i < id_list.size(); i++) { + ev->LookupOrCreateKey(id_list[i], &value_ptr, &is_filter, false); + ev->flat(value_ptr); + int64 global_step = rand() % 100; + ev->UpdateVersion(value_ptr, global_step); + } + Tensor part_offset_tensor(DT_INT32, TensorShape({kSavedPartitionNum + 1})); + + BundleWriter writer(Env::Default(), Prefix("foo")); + timespec start, end; + double total_time = 0.0; + embedding::ShrinkArgs shrink_args; + shrink_args.global_step = 100; + clock_gettime(CLOCK_MONOTONIC, &start); + ev->Save("var", Prefix("foo"), &writer, shrink_args); + clock_gettime(CLOCK_MONOTONIC, &end); + total_time += (double)(end.tv_sec - start.tv_sec) * 1000000000 + end.tv_nsec - + start.tv_nsec; + TF_ASSERT_OK(writer.Finish()); + LOG(INFO) << "[TestSave]execution time: " << total_time / 1000000 << "ms"; + ev->Unref(); +} + +TEST(EmbeddingVariablePerformanceTest, TestSave) { + int value_size = 32; + int64 default_value_dim = 4096; + Tensor default_value(DT_FLOAT, TensorShape({default_value_dim, value_size})); + auto default_value_matrix = default_value.matrix(); + for (int i = 0; i < default_value_dim; i++) { + for (int j = 0; j < value_size; j++) { + default_value_matrix(i, j) = i * value_size + j; + } + } + + int num_of_ids = 1000000; + srand((unsigned)time(NULL)); + std::vector id_list(num_of_ids); + for (int i = 0; i < num_of_ids; i++) { + id_list[i] = rand() % 50000000; + } + PerfSave(default_value, id_list, value_size, default_value_dim); +} + +TEST(EmbeddingVariablePerformanceTest, TestGlobalStepEviction) { + int value_size = 32; + int64 default_value_dim = 4096; + Tensor default_value(DT_FLOAT, TensorShape({default_value_dim, value_size})); + auto default_value_matrix = default_value.matrix(); + for (int i = 0; i < default_value_dim; i++) { + for (int j = 0; j < value_size; j++) { + default_value_matrix(i, j) = i * value_size + j; + } + } + + int num_of_ids = 1000000; + std::vector id_list(num_of_ids); + srand((unsigned)time(NULL)); + for (int i = 0; i < num_of_ids; i++) { + id_list[i] = rand() % 50000000; + } + PerfSave(default_value, id_list, value_size, default_value_dim, 80); +} + +TEST(EmbeddingVariablePerformanceTest, TestL2WeightEviction) { + int value_size = 32; + int64 default_value_dim = 4096; + Tensor default_value(DT_FLOAT, TensorShape({default_value_dim, value_size})); + auto default_value_matrix = default_value.matrix(); + for (int i = 0; i < default_value_dim; i++) { + for (int j = 0; j < value_size; j++) { + default_value_matrix(i, j) = i * value_size + j; + } + } + + int l2_weight_threshold_index = default_value_dim * 0.2; + float l2_weight_threshold = 0.0; + for (int64 j = 0; j < value_size; j++) { + l2_weight_threshold += + pow(default_value_matrix(l2_weight_threshold_index, j), 2); + } + l2_weight_threshold *= 0.5; + + int num_of_ids = 1000000; + std::vector id_list(num_of_ids); + srand((unsigned)time(NULL)); + for (int i = 0; i < num_of_ids; i++) { + id_list[i] = rand() % 50000000; + } + PerfSave(default_value, id_list, value_size, default_value_dim, 0, + l2_weight_threshold); +} + +TEST(EmbeddingVariablePerformanceTest, TestCounterFilterLookupOrCreate) { + int num_of_batch = 100; + int batch_size = 1024 * 128; + int num_of_ids = 5000000; + int64 filter_freq = 5; + std::vector> input_batches(num_of_batch); + for (int i 
= 0; i < num_of_batch; i++) { + input_batches[i].resize(batch_size); + } + LOG(INFO) << "[TestCounterFilterLookupOrCreate] Start generating skew input"; + GenerateSkewInput(num_of_ids, 0.8, input_batches); + LOG(INFO) << "[TestCounterFilterLookupOrCreate] Finish generating skew input"; + std::vector num_thread_vec({1, 2, 4, 8, 16}); + for (auto num_thread : num_thread_vec) { + LOG(INFO) << "[TestCounterFilterLookupOrCreate] Test LookupOrCreate With " + << num_thread << " threads."; + double exec_time = + PerfLookupOrCreate(input_batches, num_thread, filter_freq); + if (exec_time == -1.0) { + LOG(INFO) << "[TestCounterFilterLookupOrCreate] Test Failed"; + } else { + LOG(INFO) << "[TestCounterFilterLookupOrCreate] Performance of " + "LookupOrCreate With " + << num_thread << " threads: " << exec_time / 1000000 << " ms"; + } + } +} +} // namespace embedding +} // namespace tensorflow diff --git a/deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_test.h b/deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_test.h new file mode 100644 index 00000000..76b566f4 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/cc/tests/embedding_variable_test.h @@ -0,0 +1,109 @@ +/* Copyright 2022 The DeepRec Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+======================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_EMBEDING_VARIABLE_TEST_H +#define TENSORFLOW_CORE_KERNELS_EMBEDING_VARIABLE_TEST_H +#include + +#include "deepray/custom_ops/embedding_variable/cc/embedding/cache.h" +#include "deepray/custom_ops/embedding_variable/cc/embedding/kv_interface.h" +#include "deepray/custom_ops/embedding_variable/cc/kernels/kv_variable_util.h" +#include "deepray/custom_ops/embedding_variable/cc/lib/tensor_bundle.h" +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#include "tensorflow/core/common_runtime/gpu/gpu_device.h" +#include "tensorflow/core/common_runtime/gpu/gpu_process_state.h" +#endif // GOOGLE_CUDA + +#include +#include + +#ifdef TENSORFLOW_USE_JEMALLOC +#include "jemalloc/jemalloc.h" +#endif + +namespace tensorflow { +namespace embedding { +struct ProcMemory { + long size; // total program size + long resident; // resident set size + long share; // shared pages + long trs; // text (code) + long lrs; // library + long drs; // data/stack + long dt; // dirty pages + + ProcMemory() + : size(0), resident(0), share(0), trs(0), lrs(0), drs(0), dt(0) {} +}; + +ProcMemory getProcMemory() { + ProcMemory m; + FILE* fp = fopen("/proc/self/statm", "r"); + if (fp == NULL) { + LOG(ERROR) << "Fail to open /proc/self/statm."; + return m; + } + + if (fscanf(fp, "%ld %ld %ld %ld %ld %ld %ld", &m.size, &m.resident, &m.share, + &m.trs, &m.lrs, &m.drs, &m.dt) != 7) { + fclose(fp); + LOG(ERROR) << "Fail to fscanf /proc/self/statm."; + return m; + } + fclose(fp); + + return m; +} + +double getSize() { + ProcMemory m = getProcMemory(); + return m.size; +} + +double getResident() { + ProcMemory m = getProcMemory(); + return m.resident; +} + +EmbeddingVar* CreateEmbeddingVar( + int value_size, Tensor& default_value, int64 default_value_dim, + int64 filter_freq = 0, int64 steps_to_live = 0, + float l2_weight_threshold = -1.0, + embedding::StorageType storage_type = embedding::StorageType::DRAM, + std::vector storage_size = {1024 * 1024 * 1024, 1024 * 1024 * 1024, + 1024 * 1024 * 1024, 1024 * 1024 * 1024}, + bool record_freq = false, int64 max_element_size = 0, + float false_positive_probability = -1.0, + DataType counter_type = DT_UINT64) { + auto embedding_config = EmbeddingConfig( + 0, 0, 1, 0, "emb_var", steps_to_live, filter_freq, 999999, + l2_weight_threshold, max_element_size, false_positive_probability, + counter_type, default_value_dim, 0.0, record_freq, false, false); + auto feat_desc = new embedding::FeatureDescriptor( + 1, 1, ev_allocator(), storage_type, record_freq, + embedding_config.is_save_version(), + {embedding_config.is_counter_filter(), filter_freq}); + auto storage = embedding::StorageFactory::Create( + embedding::StorageConfig(storage_type, "", storage_size, + embedding_config), + cpu_allocator(), feat_desc, "emb_var"); + auto ev = new EmbeddingVar("emb_var", storage, embedding_config, + cpu_allocator(), feat_desc); + ev->Init(default_value, default_value_dim); + return ev; +} +} // namespace embedding +} // namespace tensorflow +#endif // TENSORFLOW_CORE_FRAMEWORK_EMBEDDING_STORAGE_FACTORY_H_ diff --git a/deepray/custom_ops/embedding_variable/config.proto b/deepray/custom_ops/embedding_variable/config.proto new file mode 100644 index 00000000..424fc5e1 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/config.proto @@ -0,0 +1,58 @@ +syntax = "proto3"; + +package tensorflow.embedding; + +enum StorageType { + // none + DEFAULT = 0; + + // one level + DRAM = 1; + PMEM_MEMKIND = 2; + PMEM_LIBPMEM = 3; + 
SSDHASH = 4; + LEVELDB = 5; + HBM = 6; + + // two level + DRAM_PMEM = 11; + DRAM_SSDHASH = 12; + HBM_DRAM = 13; + DRAM_LEVELDB = 14; + + // three level + DRAM_PMEM_SSDHASH = 101; + HBM_DRAM_SSDHASH = 102; + +} + +enum CopyBackFlag { + NOT_COPYBACK = 0; + COPYBACK = 1; + COPYBACK_AND_DESTROY = 2; +} + +enum SlotType { + EMBEDDING_VARIABLE = 0; + VARIABLE = 1; +} + +enum CacheStrategy { + LRU = 0; + LFU = 1; +} + +enum EmbeddingVariableType { + IMMUTABLE = 0; + MUTABLE = 1; +} + +enum ValuePtrStatus { + OK = 0; + IS_DELETED = 1; + NOT_IN_DRAM = 2; +} + +enum IsSetInitialized { + NOT_SET_INITAILIZED = 0; +} diff --git a/deepray/custom_ops/embedding_variable/embedding_variable_ops_test.py b/deepray/custom_ops/embedding_variable/embedding_variable_ops_test.py new file mode 100644 index 00000000..42ca0c6b --- /dev/null +++ b/deepray/custom_ops/embedding_variable/embedding_variable_ops_test.py @@ -0,0 +1,114 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for multiplex_2.""" + +import numpy as np +import tensorflow as tf + +from deepray.custom_ops.multiplex_2 import multiplex_2_op +from tensorflow.python.framework import errors_impl +# This pylint disable is only needed for internal google users +from tensorflow.python.framework import test_util # pylint: disable=g-direct-tensorflow-import + + +@test_util.with_eager_op_as_function +class MultiplexOpRank1Test(tf.test.TestCase): + + @test_util.run_in_graph_and_eager_modes + def test_multiplex_int(self): + a = tf.constant([1, 2, 3, 4, 5], dtype=tf.int64) + b = tf.constant([10, 20, 30, 40, 50], dtype=tf.int64) + cond = tf.constant([True, False, True, False, True], dtype=bool) + expect = np.where(self.evaluate(cond), self.evaluate(a), self.evaluate(b)) + # expected result is [1, 20, 3, 40, 5] + result = multiplex_2_op.multiplex(cond, a, b) + self.assertAllEqual(result, expect) + + @test_util.run_in_graph_and_eager_modes + def test_multiplex_float(self): + a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0]) + b = tf.constant([10.0, 20.0, 30.0, 40.0, 50.0]) + cond = tf.constant([True, False, True, False, True], dtype=bool) + # expected result is [1.0, 20.0, 3.0, 40.0, 5.0] + expect = np.where(self.evaluate(cond), self.evaluate(a), self.evaluate(b)) + result = multiplex_2_op.multiplex(cond, a, b) + self.assertAllEqual(result, expect) + + @test_util.run_in_graph_and_eager_modes + def test_multiplex_bad_types(self): + a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0]) # float + b = tf.constant([10, 20, 30, 40, 50], dtype=tf.int64) + cond = tf.constant([True, False, True, False, True], dtype=bool) + with self.assertRaisesRegex( + (errors_impl.InvalidArgumentError, TypeError), + # Eager mode raises InvalidArgumentError with the following message + r'(cannot compute Examples>MultiplexDense as input #2\(zero-based\) ' + r'was expected to be a float tensor but is a int64 tensor ' + r'\[Op:Examples>MultiplexDense\]' + r')|(' + 
# Graph mode raises TypeError with the following message + r"Input 'b' of 'Examples>MultiplexDense' Op has type int64 that " + r"does not match type float32 of argument 'a'.)" + ): + self.evaluate(multiplex_2_op.multiplex(cond, a, b)) + + @test_util.run_in_graph_and_eager_modes + def test_multiplex_bad_size(self): + a = tf.constant([1, 2, 3, 4, 5], dtype=tf.int64) # longer than b + b = tf.constant([10, 20], dtype=tf.int64) # shorter than a + cond = tf.constant([True, False, True, False, True], dtype=bool) + with self.assertRaisesRegex( + (errors_impl.InvalidArgumentError, ValueError), + # Eager mode raises InvalidArgumentError with the following message + r'(?s)(a and b must have the same shape. ' + r'a shape: \[5\] b shape: \[2\].* ' + r'\[Op:Examples>MultiplexDense\]' + r')|(' + # Graph mode raises ValueError with the following message + r'Dimension 0 in both shapes must be equal, but are 5 and 2\. ' + r'Shapes are \[5\] and \[2\]\.)' + ): + self.evaluate(multiplex_2_op.multiplex(cond, a, b)) + + @test_util.run_in_graph_and_eager_modes + def test_multiplex_2d(self): + a = tf.constant([[1, 2, 3], [4, 5, 6]], dtype=tf.int64) + b = tf.constant([[10, 20, 30], [40, 50, 60]], dtype=tf.int64) + cond = tf.constant([[True, False, True], [False, True, False]], dtype=bool) + expect = np.where(self.evaluate(cond), self.evaluate(a), self.evaluate(b)) + # expected result is [[1, 20], [3, 40]] + result = multiplex_2_op.multiplex(cond, a, b) + self.assertAllEqual(result, expect) + + @test_util.run_in_graph_and_eager_modes + def test_multiplex_bad_shape(self): + a = tf.constant([[1, 2, 3], [4, 5, 6]], dtype=tf.int64) # shape (2,3) + b = tf.constant([[10, 20], [30, 40], [50, 60]], dtype=tf.int64) # shape (3,2) + cond = tf.constant([[True, False, True], [False, True, False]], dtype=bool) + with self.assertRaisesRegex( + (errors_impl.InvalidArgumentError, ValueError), + # Eager mode raises InvalidArgumentError with the following message + r'(a and b must have the same shape.' + r' a shape: \[2,3\] b shape: \[3,2\]' + r')|(' + # Graph mode raises ValueError with the following message + r'Dimension 0 in both shapes must be equal, ' + r'but are 2 and 3\. Shapes are \[2,3\] and \[3,2\])\.' + ): + self.evaluate(multiplex_2_op.multiplex(cond, a, b)) + + +if __name__ == '__main__': + tf.test.main() diff --git a/deepray/custom_ops/embedding_variable/multiplex_1_test.py b/deepray/custom_ops/embedding_variable/multiplex_1_test.py new file mode 100644 index 00000000..2f2045e6 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/multiplex_1_test.py @@ -0,0 +1,50 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for multiplex_1.""" + +import numpy as np +import tensorflow as tf + +from deepray.custom_ops.embedding_variable import gen_kv_variable_ops +# This pylint disable is only needed for internal google users +from tensorflow.python.framework import errors_impl # pylint: disable=g-direct-tensorflow-import +from tensorflow.python.framework import test_util # pylint: disable=g-direct-tensorflow-import + + +@test_util.with_eager_op_as_function +class MultiplexOpRank1Test(tf.test.TestCase): + + @test_util.run_in_graph_and_eager_modes + def test_multiplex_int(self): + print(gen_kv_variable_ops) + print(dir(gen_kv_variable_ops)) + + # @test_util.run_in_graph_and_eager_modes + # def test_multiplex_int(self): + # shape = [3] + # dtype = tf.float32 + # shared_name = "var_1_2" + # name = "var_1/" + # _invalid_key_type = tf.int64 + # container = "" + # gen_kv_variable_ops.kv_var_handle_op(shape=shape, dtype=dtype, + # shared_name=shared_name, + # name=name, + # Tkeys=_invalid_key_type, + # container=container) + + +if __name__ == '__main__': + tf.test.main() diff --git a/deepray/seq2seq/tests/__init__.py b/deepray/custom_ops/embedding_variable/python/__init__.py similarity index 100% rename from deepray/seq2seq/tests/__init__.py rename to deepray/custom_ops/embedding_variable/python/__init__.py diff --git a/deepray/custom_ops/embedding_variable/python/group_embedding_lookup_ops.py b/deepray/custom_ops/embedding_variable/python/group_embedding_lookup_ops.py new file mode 100644 index 00000000..35536d6b --- /dev/null +++ b/deepray/custom_ops/embedding_variable/python/group_embedding_lookup_ops.py @@ -0,0 +1,543 @@ +import sys +from collections import defaultdict + +import tensorflow as tf +from tensorflow.python.framework import indexed_slices +from tensorflow.python.framework import ops +from tensorflow.python.framework import sparse_tensor +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import resource_loader +from tensorflow.python.platform import tf_logging as logging + +import deepray as dp +from . 
import kv_variable_ops +from .group_embedding_types import (DistStrategy, get_group_lookup_strategy) + +gen_group_embedding_ops = tf.load_op_library(resource_loader.get_path_to_datafile("../_group_embedding_ops.so")) + +__all__ = ["group_embedding_lookup", "group_embedding_lookup_sparse"] + + +#for GPU EV group_lookup_dense +def group_embedding_var_lookup_dense(params, dense_values, dimensions, ev_init_value=None): + if ev_init_value is not None: + default_value = ev_init_value + is_use_default_value_tensor = True + else: + default_value = ops.convert_to_tensor(1.0) + is_use_default_value_tensor = False + return gen_group_embedding_ops.group_embedding_var_lookup_dense( + params, dense_values, default_value, dimensions, is_use_default_value_tensor + ) + + +#for GPU EV group_lookup +def group_embedding_var_lookup( + params, + sp_values, + sp_indices, + sp_weights, + combiners, + batch_size, + dimensions, + ignore_weights, + is_sequence=False, + ev_init_value=None +): + if ev_init_value is not None: + default_value = ev_init_value + is_use_default_value_tensor = True + else: + default_value = ops.convert_to_tensor(1.0) + is_use_default_value_tensor = False + if ignore_weights: + sp_weight = ops.convert_to_tensor(1.0) + sp_weights = [sp_weight for _ in range(len(sp_values))] + return gen_group_embedding_ops.group_embedding_var_lookup( + params, + sp_values, + sp_indices, + sp_weights, + batch_size, + default_value, + combiners, + dimensions, + ignore_weights=ignore_weights, + is_use_default_value_tensor=is_use_default_value_tensor, + is_sequence=is_sequence + ) + + +def group_embedding_lookup(params, ids, partition_strategy="mod", name=None): + """ + This interface is designed for fused multiple embedding lookup. + Args: + params: list, tuple + a list or tuple of trainable *Variable* or *EmbeddingVariable*. + ids: list, tuple + a list or tuple of tf.SparseTensor or tf.Tensor. + btw RaggedTensor is preferred. + name: The operations name + Returns + ------- + emb_vec: list + a list of tf.Tensor(the results of lookup). 
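+
+  Example (an illustrative sketch; the dense `tf.Variable` tables and id tensors
+  below are hypothetical placeholders, not fixtures from this repository):
+
+  ```python
+  import tensorflow as tf
+
+  table_a = tf.Variable(tf.random.normal([1000, 16]))  # vocab 1000, dim 16
+  table_b = tf.Variable(tf.random.normal([500, 16]))   # vocab 500, dim 16
+  ids_a = tf.constant([3, 7, 42], dtype=tf.int64)
+  ids_b = tf.constant([0, 5, 9], dtype=tf.int64)
+
+  # Under the default LOCALIZED strategy, lookups that share an embedding
+  # dimension are grouped into one fused gather; conceptually, vec_a and
+  # vec_b hold the gathered rows for ids_a and ids_b.
+  vec_a, vec_b = group_embedding_lookup([table_a, table_b], [ids_a, ids_b])
+  ```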
+ """ + + if params is None: + raise ValueError("params must be specified") + if not isinstance(params, list): + params = [params] + for index, param in enumerate(params): + if isinstance(param, dp.layers.embedding_variable.EmbeddingVariable): + params[index] = param.embedding_variable + + if len(params) != len(ids): + raise ValueError("len of params must be equal to len of ids") + + ## Currently not doing unique + strategy = get_group_lookup_strategy() + + if strategy == DistStrategy.LOCALIZED: + + emb_vec = [None for _ in range(len(params))] + + ev_group_id_map = {} + tf_group_id_map = {} + ev_group_id = 0 + tf_group_id = 0 + is_ev_list = [False for _ in range(len(params))] + params_idx_map = {} + + for index, param in enumerate(params): + params_idx_map[param.ref()] = index + + if isinstance(param, kv_variable_ops.EmbeddingVariable): + is_ev_list[index] = True + dim = param.shape[0] + if dim not in ev_group_id_map: + ev_group_id_map[dim] = ev_group_id + ev_group_id += 1 + else: # tensorflow variable + dim = param.shape[1] + if dim not in tf_group_id_map: + tf_group_id_map[dim] = tf_group_id + tf_group_id += 1 + + if ev_group_id > 0: + ev_ids = [[] for _ in range(ev_group_id)] + ev_handlers = [[] for _ in range(ev_group_id)] + ev_dimensions = [0 for _ in range(ev_group_id)] + output_index_list = [[] for _ in range(ev_group_id)] + + for index, ev_flag in enumerate(is_ev_list): + if not ev_flag: + continue + param = params[index] + dim = param.shape[0] + group_id = ev_group_id_map[dim] + ev_id = ids[index] + + ev_dimensions[group_id] = dim + resource_variable_ops.variable_accessed(param) + ev_handlers[group_id].append(param.handle) + ev_ids[group_id].append(array_ops.reshape(ev_id, [-1])) + output_index_list[group_id].append(params_idx_map[param.ref()]) + + for group_id in range(ev_group_id): + dim = ev_dimensions[group_id] + output_index = output_index_list[group_id] + with ops.name_scope(name, "localized_group_embedding_lookup_ev_dim{}".format(dim), params + ids) as name_scope: + outputs = group_embedding_var_lookup_dense(ev_handlers[group_id], ev_ids[group_id], dim)[0] + for idx, output in zip(output_index, outputs): + emb_vec[idx] = output + + if tf_group_id > 0: + tf_ids = [[] for _ in range(tf_group_id)] + tf_handlers = [[] for _ in range(tf_group_id)] + tf_dimensions = [0 for _ in range(tf_group_id)] + output_index_list = [[] for _ in range(tf_group_id)] + + for index, ev_flag in enumerate(is_ev_list): + if ev_flag: + continue + param = params[index] + dim = param.shape[1] + group_id = tf_group_id_map[dim] + tf_id = ids[index] + + tf_dimensions[group_id] = dim + tf_handlers[group_id].append(param) + tf_ids[group_id].append(array_ops.reshape(tf_id, [-1])) + output_index_list[group_id].append(params_idx_map[param.ref()]) + + for group_id in range(tf_group_id): + dim = tf_dimensions[group_id] + output_index = output_index_list[group_id] + with ops.name_scope( + name, "localized_group_embedding_lookup_variable_dim{}".format(dim), params + ids + ) as name_scope: + outputs = group_embedding_lookup_ops.group_variable_lookup_dense( + tf_handlers[group_id], tf_ids[group_id], dim + )[0] + for idx, output in zip(output_index, outputs): + emb_vec[idx] = output + + else: + raise ValueError("Unrecognized strategy, expected collective, given{}".format(strategy)) + + return emb_vec + + +def group_embedding_lookup_sparse( + params, + sp_ids, + combiners, + sp_weights=None, + partition_strategy='mod', + is_sequence=False, + params_num_per_group=sys.maxsize, + name=None, +): + """ + This interface is 
designed for fused multiple embedding lookup.
+  Args:
+    params: list, tuple
+      a list or tuple of trainable *Variable* or *EmbeddingVariable*.
+    sp_ids: list, tuple
+      a list or tuple of tf.SparseTensor or tf.RaggedTensor.
+      RaggedTensor is preferred.
+    combiners: list, tuple
+      a list or tuple of strings specifying the combiner of each embedding lookup;
+      supported values are *sum* and *mean*.
+    sp_weights: list, tuple
+      a list or tuple of tf.SparseTensor used for the embedding lookup.
+    is_sequence: bool
+      if False, each result is a `Tensor` of shape `[batch_size, D]`;
+      if True, each result is a `Tensor` of shape `[batch_size, T, D]`.
+    params_num_per_group: int
+      The number of params handled by each GroupEmbedding op. The function schedules
+      len(params) // params_num_per_group + 1 GroupEmbedding ops. The default launches a
+      single op containing all params, which suits GPU scenarios and maximizes GPU
+      utilization. Set it to 1 when the op is placed on the CPU to maximize inter-op
+      parallelism.
+    name: The operation's name.
+  Returns
+  -------
+  emb_vec: list
+    a list of tf.Tensor (the lookup results).
+  """
+
+  if combiners is None:
+    logging.warn('The default value of combiner will change from "mean" to "sqrtn" after 2016/11/01.')
+    combiners = ['mean'] * len(params)
+  if not isinstance(combiners, list):
+    combiners = [combiners]
+  for combiner in combiners:
+    if combiner not in ('mean', 'sum'):
+      raise ValueError("combiners must be one of 'mean', 'sum'")
+
+  if params is None:
+    raise ValueError('params must be specified')
+  if not isinstance(params, list):
+    params = [params]
+
+  # PartitionedVariable is currently not supported.
+  for index, param in enumerate(params):
+    if isinstance(param, variables.PartitionedVariable):
+      tmp_param = list(param)
+      if len(tmp_param) != 1:
+        raise TypeError("PartitionedVariable is not supported in 'group_embedding_lookup_sparse'. 
") + params[index] = tmp_param[0] + elif isinstance(param, dp.layers.embedding_variable.EmbeddingVariable): + params[index] = param.embedding_variable + + ignore_weights = sp_weights is None + + if len(combiners) != len(sp_ids): + raise ValueError('len of combiners must be equal to len of sp_ids') + if len(combiners) != len(params): + raise ValueError('len of combiners must be equal to len of params') + if not ignore_weights: + if len(combiners) != len(sp_weights): + raise ValueError('len of combiners must be equal to len of sp_weights') + + strategy = get_group_lookup_strategy() + if strategy == DistStrategy.SOK: + import horovod.tensorflow as hvd + should_shard = False + if len(params) > hvd.size(): + should_shard = True + global_size = hvd.size() + if should_shard: + for (index, param) in enumerate(params): + param.target_gpu = index % global_size + else: + for (index, param) in enumerate(params): + param.target_gpu = -1 + + try: + from sparse_operation_kit import experiment as sok + except: + raise ImportError('sparse_operation_kit is not found while group_embedding strategy is given `collective`') + with ops.name_scope(name, 'group_embedding_lookup', params + sp_ids) as name_scope: + emb_vec = sok.lookup_sparse(params, sp_ids, combiners=combiners) + elif strategy == DistStrategy.HB: + emb_vec = [] + with ops.name_scope(name, 'group_embedding_lookup', params + sp_ids) as name_scope: + for idx, embedding in enumerate(params): + if not ignore_weights: + sp_weight = sp_weights[idx] + else: + sp_weight = None + emb_vec.append(embedding_lookup_sparse(embedding, sp_ids[idx], sp_weight, combiner=combiners[idx])) + + elif strategy == DistStrategy.LOCALIZED: + + emb_vec = [None for _ in range(len(params))] + + ev_group_id_map = {} + tf_group_id_map = {} + ev_group_id = 0 + tf_group_id = 0 + is_ev_list = [False for _ in range(len(params))] + params_idx_map = defaultdict(list) # queue + + for (index, param) in enumerate(params): + params_idx_map[param.ref()].append(index) + sp_id = sp_ids[index] + if not isinstance(sp_id, sparse_tensor.SparseTensor): + try: # assume RaggedTensor + sp_id = sp_id.to_sparse() + sp_ids[index] = sp_id + except: + raise ValueError('sp_id is neither SparseTensor nor RaggedTensor!') + + if not ignore_weights: + sp_weight = sp_weights[index] + if sp_weight is not None: + if not isinstance(sp_weight, sparse_tensor.SparseTensor): + raise TypeError('sp_weights must be either None or SparseTensor') + sp_id.values.get_shape().assert_is_compatible_with(sp_weight.values.get_shape()) + sp_id.indices.get_shape().assert_is_compatible_with(sp_weight.indices.get_shape()) + sp_id.dense_shape.get_shape().assert_is_compatible_with(sp_weight.dense_shape.get_shape()) + + if isinstance(param, kv_variable_ops.EmbeddingVariable): + is_ev_list[index] = True + dim = param.shape[0] + if dim not in ev_group_id_map: + ev_group_id_map[dim] = ev_group_id + ev_group_id += 1 + else: + # tensorflow variable + dim = param.shape[1] + if dim not in tf_group_id_map: + tf_group_id_map[dim] = tf_group_id + tf_group_id += 1 + + if ev_group_id > 0: + ev_sp_values = [[] for _ in range(ev_group_id)] + ev_sp_indices = [[] for _ in range(ev_group_id)] + ev_sp_weights = [[] for _ in range(ev_group_id)] + ev_dense_shapes = [[] for _ in range(ev_group_id)] + ev_handlers = [[] for _ in range(ev_group_id)] + ev_dimensions = [0 for _ in range(ev_group_id)] + ev_combiners = ['mean' for _ in range(ev_group_id)] + output_index_list = [[] for _ in range(ev_group_id)] + + for (index, ev_flag) in enumerate(is_ev_list): + if 
not ev_flag: + continue + param = params[index] + dim = param.shape[0] + group_id = ev_group_id_map[dim] + sp_id = sp_ids[index] + combiner = combiners[index] + + ev_combiners[group_id] = combiner + ev_dimensions[group_id] = dim + resource_variable_ops.variable_accessed(param) + ev_handlers[group_id].append(param.handle) + ev_sp_values[group_id].append(sp_id.values) + ev_sp_indices[group_id].append(sp_id.indices) + ev_dense_shapes[group_id].append(sp_id.dense_shape) + output_index_list[group_id].append(params_idx_map[param.ref()].pop(0)) + + if not ignore_weights: + sp_weight = sp_weights[index] + ev_sp_weights[group_id].append(sp_weight.values) + + for group_id in range(ev_group_id): + dim = ev_dimensions[group_id] + output_index = output_index_list[group_id] + + (num_sub_group, num_remainder) = \ + divmod(len(ev_handlers[group_id]), + params_num_per_group) + for j in range(num_sub_group): + sub_ev_sp_weight = ( + [None for _ in range(params_num_per_group)] if ignore_weights else + (ev_sp_weights[group_id])[j * params_num_per_group:(j + 1) * params_num_per_group] + ) + with ops.name_scope( + name, 'localized_group_embedding_lookup_ev_dim{}_{}'.format(dim, j), params + sp_ids + ) as name_scope: + outputs = group_embedding_var_lookup( + (ev_handlers[group_id])[j * params_num_per_group:(j + 1) * params_num_per_group], + (ev_sp_values[group_id])[j * params_num_per_group:(j + 1) * params_num_per_group], + (ev_sp_indices[group_id])[j * params_num_per_group:(j + 1) * params_num_per_group], + sub_ev_sp_weight, + ev_combiners[group_id], + (ev_dense_shapes[group_id])[j * params_num_per_group:(j + 1) * params_num_per_group], + dim, + ignore_weights, + is_sequence, + )[0] + + for (idx, output) in zip(output_index[j * params_num_per_group:(j + 1) * params_num_per_group], outputs): + emb_vec[idx] = output + + if num_remainder > 0: + sub_ev_sp_weight = ( + [None for _ in range(num_remainder)] if ignore_weights else (ev_sp_weights[group_id])[-num_remainder:] + ) + with ops.name_scope( + name, 'localized_group_embedding_lookup_ev_dim{}'.format(dim), params + sp_ids + ) as name_scope: + outputs = group_embedding_var_lookup( + (ev_handlers[group_id])[-num_remainder:], + (ev_sp_values[group_id])[-num_remainder:], + (ev_sp_indices[group_id])[-num_remainder:], + sub_ev_sp_weight, + ev_combiners[group_id], + (ev_dense_shapes[group_id])[-num_remainder:], + dim, + ignore_weights, + is_sequence, + )[0] + + for (idx, output) in zip(output_index[-num_remainder:], outputs): + emb_vec[idx] = output + + if tf_group_id > 0: + tf_sp_values = [[] for _ in range(tf_group_id)] + tf_sp_indices = [[] for _ in range(tf_group_id)] + tf_sp_weights = [[] for _ in range(tf_group_id)] + tf_dense_shape = [[] for _ in range(tf_group_id)] + tf_handlers = [[] for _ in range(tf_group_id)] + tf_dimensions = [0 for _ in range(tf_group_id)] + tf_combiners = ['mean' for _ in range(tf_group_id)] + output_index_list = [[] for _ in range(tf_group_id)] + + for (index, ev_flag) in enumerate(is_ev_list): + if ev_flag: + continue + param = params[index] + dim = param.shape[1] + group_id = tf_group_id_map[dim] + sp_id = sp_ids[index] + combiner = combiners[index] + + tf_combiners[group_id] = combiner + tf_dimensions[group_id] = dim + tf_handlers[group_id].append(param) + tf_sp_values[group_id].append(sp_id.values) + tf_sp_indices[group_id].append(sp_id.indices) + tf_dense_shape[group_id].append(sp_id.dense_shape) + output_index_list[group_id].append(params_idx_map[param].pop(0)) + + if not ignore_weights: + sp_weight = sp_weights[index] + 
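+          # Only the weight values are collected here; the matching indices and
+          # dense_shape for each weighted lookup are taken from sp_id above.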
tf_sp_weights[group_id].append(sp_weight.values) + + for group_id in range(tf_group_id): + dim = tf_dimensions[group_id] + output_index = output_index_list[group_id] + + (num_sub_group, num_remainder) = divmod(len(tf_handlers[group_id]), params_num_per_group) + for j in range(num_sub_group): + sub_tf_sp_weight = ( + [None for _ in range(params_num_per_group)] if ignore_weights else + (tf_sp_weights[group_id])[j * params_num_per_group:(j + 1) * params_num_per_group] + ) + with ops.name_scope( + name, 'localized_group_embedding_lookup_variable_dim{}_{}'.format(dim, j), params + sp_ids + ) as name_scope: + outputs = group_embedding_lookup_ops.group_variable_lookup( + (tf_handlers[group_id])[j * params_num_per_group:(j + 1) * params_num_per_group], + (tf_sp_values[group_id])[j * params_num_per_group:(j + 1) * params_num_per_group], + (tf_sp_indices[group_id])[j * params_num_per_group:(j + 1) * params_num_per_group], + sub_tf_sp_weight, + tf_combiners[group_id], + (tf_dense_shape[group_id])[j * params_num_per_group:(j + 1) * params_num_per_group], + dim, + ignore_weights, + is_sequence, + )[0] + + for (idx, output) in zip(output_index[j * params_num_per_group:(j + 1) * params_num_per_group], outputs): + emb_vec[idx] = output + + if num_remainder > 0: + sub_tf_sp_weight = ( + [None for _ in range(num_remainder)] if ignore_weights else (tf_sp_weights[group_id])[-num_remainder:] + ) + with ops.name_scope( + name, 'localized_group_embedding_lookup_variable_dim{}'.format(dim), params + sp_ids + ) as name_scope: + outputs = group_embedding_lookup_ops.group_variable_lookup( + (tf_handlers[group_id])[-num_remainder:], + (tf_sp_values[group_id])[-num_remainder:], + (tf_sp_indices[group_id])[-num_remainder:], + sub_tf_sp_weight, + tf_combiners[group_id], + (tf_dense_shape[group_id])[-num_remainder:], + dim, + ignore_weights, + is_sequence, + )[0] + + for (idx, output) in zip(output_index[-num_remainder:], outputs): + emb_vec[idx] = output + elif strategy == DistStrategy.UNKNOWN: + + raise ValueError('Unrecognized strategy, expected collective, given{}'.format(strategy)) + + return emb_vec + + +@ops.RegisterGradient("GroupEmbeddingVarLookupDense") +def _GroupGatherDenseGrad(op, *top_grads): + ev_num = op.get_attr("num_lookups") + grads = [] + for i in range(ev_num): + handle = op.inputs[i] + indice = op.inputs[ev_num + i] + params_shape = resource_variable_ops.variable_shape(handle) + grad = top_grads[i] + grads.append(indexed_slices.IndexedSlices(grad, indice, params_shape)) + return grads + [None for _ in range(ev_num + 1)] + + +@ops.RegisterGradient("GroupEmbeddingVarLookup") +def _GroupGatherGrad(op, *grads): + ev_num = op.get_attr("num_lookups") + combiner = op.get_attr("combiner") + dimension = op.get_attr("dimension") + return_grads = [] + params = op.inputs[:ev_num] + sp_indices = op.inputs[ev_num * 2:ev_num * 3] + unique_values = op.outputs[ev_num:2 * ev_num] + batch_nums = op.outputs[3 * ev_num:4 * ev_num] + with ops.colocate_with(params[0]): + nnz_grads = gen_group_embedding_ops.group_embedding_variable_lookup_grad( + grads[:ev_num], params, unique_values, sp_indices, batch_nums, dimension, combiner + ) + for i in range(ev_num): + handle = params[i] + params_shape = resource_variable_ops.variable_shape(handle) + indice = unique_values[i] + grad = nnz_grads[i] + return_grads.append(indexed_slices.IndexedSlices(grad, indice, params_shape)) + return return_grads + [None for _ in range(ev_num * 4 + 1)] diff --git a/deepray/custom_ops/embedding_variable/python/group_embedding_types.py 
b/deepray/custom_ops/embedding_variable/python/group_embedding_types.py new file mode 100644 index 00000000..4eb679c9 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/python/group_embedding_types.py @@ -0,0 +1,54 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Defines functions common to group embedding lookup files.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from enum import Enum, unique + + +@unique +class DistStrategy(Enum): + SOK = "sok" + HB = "hb" + DISTRIBUTED = "ps" + LOCALIZED = "localized" + UNKNOWN = "unknown" + + +_group_lookup_strategy = DistStrategy.LOCALIZED + + +def set_group_lookup_strategy(strategy): + + def str_to_strategy(strategy): + if strategy == "sok": + return DistStrategy.SOK + elif strategy == "hb": + return DistStrategy.HB + elif strategy == "ps": + return DistStrategy.DISTRIBUTED + elif strategy == "localized": + return DistStrategy.LOCALIZED + + global _group_lookup_strategy + _group_lookup_strategy = str_to_strategy(strategy) + + +def get_group_lookup_strategy(): + global _group_lookup_strategy + return _group_lookup_strategy diff --git a/deepray/custom_ops/embedding_variable/python/kv_variable_ops.py b/deepray/custom_ops/embedding_variable/python/kv_variable_ops.py new file mode 100644 index 00000000..c9dab432 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/python/kv_variable_ops.py @@ -0,0 +1,1027 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Ops to use variables as resources.""" + +# pylint: disable=g-bad-name +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import contextlib +import os +import weakref + +import tensorflow as tf +from absl import flags +from tensorflow.core.framework import attr_value_pb2 +from tensorflow.python.eager import context +from tensorflow.python.eager import tape +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import indexed_slices +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor as tensor_module +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import handle_data_util +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import variables +from tensorflow.python.ops.resource_variable_ops import get_eager_safe_handle_data, _combine_handle_data, _set_handle_shapes_and_types, ResourceVariable +from tensorflow.python.platform import resource_loader +from tensorflow.python.saved_model import registration +from tensorflow.python.trackable import base as trackable +from tensorflow.python.training.saving import saveable_object +from tensorflow.python.util import compat + +from deepray.custom_ops.embedding_variable import config_pb2 +from deepray.custom_ops.embedding_variable import variables as ev_variables +from deepray.utils import logging_util + +gen_kv_variable_ops = tf.load_op_library(resource_loader.get_path_to_datafile("../_kv_variable_ops.so")) + +logger = logging_util.get_logger() + +__all__ = ["EmbeddingVariable"] + + +def _variable_handle_from_shape_and_dtype(shape, dtype, key_type, shared_name, name, graph_mode, initial_value=None): + """Create a variable handle, copying in handle data from `initial_value`.""" + container = ops.get_default_graph()._container # pylint: disable=protected-access + if container is None: + container = "" + shape = tensor_shape.as_shape(shape) + dtype = dtypes.as_dtype(dtype) + key_type = dtypes.as_dtype(key_type) + + handle = gen_kv_variable_ops.kv_var_handle_op( + shape=shape, + dtype=dtype, + Tkeys=key_type, + shared_name=shared_name, + # debug_name=name, + name=name, + container=container + ) + if initial_value is None: + initial_value = handle + if graph_mode: + full_handle_data = _combine_handle_data(handle, initial_value) + _set_handle_shapes_and_types(handle, full_handle_data, graph_mode) + return handle + else: + handle_data = handle_data_util.create_handle_data(shape, dtype) + if initial_value is not None and initial_value.dtype == dtypes.variant: + extra_handle_data = get_eager_safe_handle_data(initial_value) + if extra_handle_data is not None and extra_handle_data.is_set: + if (not handle_data.is_set or len(handle_data.shape_and_type) != 1): + raise RuntimeError("Expected VarHandleOp to return a length==1 shape_and_type, " + f"but saw: '{handle_data}'") + handle_data.shape_and_type.extend(extra_handle_data.shape_and_type) + + _set_handle_shapes_and_types(handle, handle_data, graph_mode) + return handle + + +def eager_safe_variable_handle(initial_value, shape, key_type, shared_name, name, graph_mode): + """Creates a variable handle with information to do shape inference. 
+ + The dtype is read from `initial_value` and stored in the returned + resource tensor's handle data. + + If `initial_value.dtype == tf.variant`, we additionally extract the handle + data (if any) from `initial_value` and append it to the `handle_data`. + In this case, the returned tensor's handle data is in the form + + ``` + is_set: true + shape_and_type { + shape { + // initial_value.shape + } + dtype: DT_VARIANT + } + shape_and_type { + // handle_data(initial_value).shape_and_type[0] + } + shape_and_type { + // handle_data(initial_value).shape_and_type[1] + } + ... + ``` + + Ops that read from this tensor, such as `ReadVariableOp` and + `AssignVariableOp`, know that `handle_data(handle).shape_and_type[1:]` + correspond to the handle data of the variant(s) stored in the Variable. + + Args: + initial_value: A `Tensor`. + shape: The shape of the handle data. Can be `TensorShape(None)` (i.e. + unknown shape). + shared_name: A string. + name: A string. + graph_mode: A python bool. + + Returns: + The handle, a `Tensor` of type `resource`. + """ + dtype = initial_value.dtype.base_dtype + return _variable_handle_from_shape_and_dtype(shape, dtype, key_type, shared_name, name, graph_mode, initial_value) + + +class EmbeddingVariable(ResourceVariable, saveable_object.SaveableObject): + """Variable based on resource handles. + + See the [Variables How To](https://tensorflow.org/guide/variables) + for a high level overview. + + A `ResourceVariable` allows you to maintain state across subsequent calls to + session.run. + + The `ResourceVariable` constructor requires an initial value for the variable, + which can be a `Tensor` of any type and shape. The initial value defines the + type and shape of the variable. After construction, the type and shape of + the variable are fixed. The value can be changed using one of the assign + methods. + + Just like any `Tensor`, variables created with + `tf.Variable(use_resource=True)` can be used as inputs for other Ops in the + graph. Additionally, all the operators overloaded for the `Tensor` class are + carried over to variables, so you can also add nodes to the graph by just + doing arithmetic on variables. + + Unlike ref-based variable, a ResourceVariable has well-defined semantics. Each + usage of a ResourceVariable in a TensorFlow graph adds a read_value operation + to the graph. The Tensors returned by a read_value operation are guaranteed to + see all modifications to the value of the variable which happen in any + operation on which the read_value depends on (either directly, indirectly, or + via a control dependency) and guaranteed to not see any modification to the + value of the variable from operations that depend on the read_value operation. + Updates from operations that have no dependency relationship to the read_value + operation might or might not be visible to read_value. + + For example, if there is more than one assignment to a ResourceVariable in + a single session.run call there is a well-defined value for each operation + which uses the variable's value if the assignments and the read are connected + by edges in the graph. 
Consider the following example, in which two writes + can cause tf.Variable and tf.ResourceVariable to behave differently: + + ```python + a = tf.Variable(1.0, use_resource=True) + a.initializer.run() + + assign = a.assign(2.0) + with tf.control_dependencies([assign]): + b = a.read_value() + with tf.control_dependencies([b]): + other_assign = a.assign(3.0) + with tf.control_dependencies([other_assign]): + # Will print 2.0 because the value was read before other_assign ran. If + # `a` was a tf.Variable instead, 2.0 or 3.0 could be printed. + tf.compat.v1.Print(b, [b]).eval() + ``` + """ + + def __init__( + self, # pylint: disable=super-init-not-called + initial_value=None, + trainable=None, + collections=None, + validate_shape=True, # pylint: disable=unused-argument + caching_device=None, + name=None, + dtype=None, + variable_def=None, + import_scope=None, + constraint=None, + distribute_strategy=None, + synchronization=None, + aggregation=None, + shape=None, + handle=None, + experimental_enable_variable_lifting=None, + invalid_key=None, + evconfig=ev_variables.EmbeddingVariableConfig(), + ht_partition_num=1000 + ): + """Creates a variable. + + Args: + initial_value: A `Tensor`, or Python object convertible to a `Tensor`, + which is the initial value for the Variable. Can also be a callable with + no argument that returns the initial value when called. (Note that + initializer functions from init_ops.py must first be bound to a shape + before being used here.) + trainable: If `True`, the default, also adds the variable to the graph + collection `GraphKeys.TRAINABLE_VARIABLES`. This collection is used as + the default list of variables to use by the `Optimizer` classes. + Defaults to `True`, unless `synchronization` is set to `ON_READ`, in + which case it defaults to `False`. + collections: List of graph collections keys. The new variable is added to + these collections. Defaults to `[GraphKeys.GLOBAL_VARIABLES]`. + validate_shape: If `False`, allows the variable to be initialized with a + value of unknown shape. If `True`, the default, the shape of + `initial_value` must be known. + caching_device: Optional device string or function describing where the + Variable should be cached for reading. Defaults to the Variable's + device. If not `None`, caches on another device. Typical use is to + cache on the device where the Ops using the Variable reside, to + deduplicate copying through `Switch` and other conditional statements. + name: Optional name for the variable. Defaults to `'Variable'` and gets + uniquified automatically. + dtype: If set, initial_value will be converted to the given type. If None, + either the datatype will be kept (if initial_value is a Tensor) or + float32 will be used (if it is a Python object convertible to a Tensor). + variable_def: `VariableDef` protocol buffer. If not None, recreates the + `ResourceVariable` object with its contents. `variable_def` and other + arguments (except for import_scope) are mutually exclusive. + import_scope: Optional `string`. Name scope to add to the + ResourceVariable. Only used when `variable_def` is provided. + constraint: An optional projection function to be applied to the variable + after being updated by an `Optimizer` (e.g. used to implement norm + constraints or value constraints for layer weights). The function must + take as input the unprojected Tensor representing the value of the + variable and return the Tensor for the projected value (which must have + the same shape). 
Constraints are not safe to use when doing asynchronous + distributed training. + distribute_strategy: The tf.distribute.Strategy this variable is being + created inside of. + synchronization: Indicates when a distributed a variable will be + aggregated. Accepted values are constants defined in the class + `tf.VariableSynchronization`. By default the synchronization is set to + `AUTO` and the current `DistributionStrategy` chooses when to + synchronize. + aggregation: Indicates how a distributed variable will be aggregated. + Accepted values are constants defined in the class + `tf.VariableAggregation`. + shape: (optional) The shape of this variable. If None, the shape of + `initial_value` will be used. When setting this argument to + `tf.TensorShape(None)` (representing an unspecified shape), the variable + can be assigned with values of different shapes. + handle: (optional) The handle of a `tf.Variable`. If provided, only + `trainable`, `shape`, `dtype`, and `handle` will be used to construct + this `tf.Variable`. + experimental_enable_variable_lifting: Whether to lift the variable out if + it's in a `tf.function`. Default is `True`. When this argument + is `True`, variable creation will follow the behavior and + restrictions described + [here](https://www.tensorflow.org/guide/function#creating_tfvariables). + If this argument is `False`, that description doesn't apply, + and you can freely create and use the variable in the + `tf.function`, as if it's a "mutable `tf.Tensor`". You can't + return the variable though. + + Raises: + ValueError: If the initial value is not specified, or does not have a + shape and `validate_shape` is `True`. + + @compatibility(eager) + When Eager Execution is enabled, the default for the `collections` argument + is `None`, which signifies that this `Variable` will not be added to any + collections. + @end_compatibility + """ + if variable_def: + if initial_value is not None: + raise ValueError( + f"The variable_def and initial_value args to " + f"`tf.Variable` are mutually exclusive, but got both: " + f"variable_def={variable_def},\n" + f"initial_value={initial_value}" + ) + if context.executing_eagerly(): + raise ValueError( + f"Creating a `tf.Variable` with a `variable_def` arg " + f"is not supported when eager execution is enabled. 
" + f"Got: variable_def={variable_def}" + ) + self._init_from_proto(variable_def, import_scope=import_scope, validate_shape=validate_shape) + elif handle is not None: + self._init_from_handle(trainable=trainable, shape=shape, dtype=dtype, handle=handle) + else: + evconfig.reveal() + self._init_from_args( + initial_value=initial_value, + trainable=trainable, + collections=collections, + caching_device=caching_device, + name=name, + dtype=dtype, + constraint=constraint, + synchronization=synchronization, + aggregation=aggregation, + shape=shape, + distribute_strategy=distribute_strategy, + validate_shape=validate_shape, + experimental_enable_variable_lifting=experimental_enable_variable_lifting, + invalid_key=invalid_key, + evconfig=evconfig, + ht_partition_num=ht_partition_num + ) + + def __repr__(self): + return "" % (self.name, self.shape, self.dtype.name) + + def _init_from_args( + self, + initial_value=None, + trainable=None, + collections=None, + caching_device=None, + name=None, + dtype=None, + constraint=None, + synchronization=None, + aggregation=None, + distribute_strategy=None, + shape=None, + validate_shape=True, + experimental_enable_variable_lifting=None, + invalid_key=-1, + evconfig=ev_variables.EmbeddingVariableConfig(), + ht_partition_num=1000 + ): + """Creates a variable. + + Args: + initial_value: A `Tensor`, or Python object convertible to a `Tensor`, + which is the initial value for the Variable. The initial value must have + a shape specified unless `validate_shape` is set to False. Can also be a + callable with no argument that returns the initial value when called. + (Note that initializer functions from init_ops.py must first be bound to + a shape before being used here.) + trainable: If `True`, the default, also adds the variable to the graph + collection `GraphKeys.TRAINABLE_VARIABLES`. This collection is used as + the default list of variables to use by the `Optimizer` classes. + Defaults to `True`, unless `synchronization` is set to `ON_READ`, in + which case it defaults to `False`. + collections: List of graph collections keys. The new variable is added to + these collections. Defaults to `[GraphKeys.GLOBAL_VARIABLES]`. + caching_device: Optional device string or function describing where the + Variable should be cached for reading. Defaults to the Variable's + device. If not `None`, caches on another device. Typical use is to + cache on the device where the Ops using the Variable reside, to + deduplicate copying through `Switch` and other conditional statements. + name: Optional name for the variable. Defaults to `'Variable'` and gets + uniquified automatically. + dtype: If set, initial_value will be converted to the given type. If None, + either the datatype will be kept (if initial_value is a Tensor) or + float32 will be used (if it is a Python object convertible to a Tensor). + constraint: An optional projection function to be applied to the variable + after being updated by an `Optimizer` (e.g. used to implement norm + constraints or value constraints for layer weights). The function must + take as input the unprojected Tensor representing the value of the + variable and return the Tensor for the projected value (which must have + the same shape). Constraints are not safe to use when doing asynchronous + distributed training. + synchronization: Indicates when a distributed a variable will be + aggregated. Accepted values are constants defined in the class + `tf.VariableSynchronization`. 
By default the synchronization is set to + `AUTO` and the current `DistributionStrategy` chooses when to + synchronize. + aggregation: Indicates how a distributed variable will be aggregated. + Accepted values are constants defined in the class + `tf.VariableAggregation`. + distribute_strategy: DistributionStrategy under which this variable was + created. + shape: (optional) The shape of this variable. If None, the shape of + `initial_value` will be used. When setting this argument to + `tf.TensorShape(None)` (representing an unspecified shape), the variable + can be assigned with values of different shapes. + validate_shape: If `False`, allows the variable to be initialized with a + value of unknown shape. If `True`, the default, the shape of + `initial_value` must be known. + experimental_enable_variable_lifting: Whether to lift the variable out if + it's in a `tf.function`. Default is `True`. When this argument + is `True`, variable creation will follow the behavior and + restrictions described + [here](https://www.tensorflow.org/guide/function#creating_tfvariables). + If this argument is `False`, that description doesn't apply, + and you can freely create and use the variable in the + `tf.function`, as if it's a "mutable `tf.Tensor`". You can't + return the variable though. + + Raises: + ValueError: If the initial value is not specified, or does not have a + shape and `validate_shape` is `True`. + + @compatibility(eager) + When Eager Execution is enabled, variables are never added to collections. + It is not implicitly added to the `GLOBAL_VARIABLES` or + `TRAINABLE_VARIABLES` collections, and the `collections` argument is + ignored. + @end_compatibility + """ + synchronization, aggregation, trainable = ( + variables.validate_synchronization_aggregation_trainable(synchronization, aggregation, trainable, name) + ) + if experimental_enable_variable_lifting is None: + experimental_enable_variable_lifting = True + if initial_value is None: + raise ValueError( + "The `initial_value` arg to `tf.Variable` must " + "be specified except when you are not providing a " + "`variable_def`. You provided neither." + ) + init_from_fn = callable(initial_value) + + if isinstance(initial_value, + tensor_module.Tensor) and hasattr(initial_value, "graph") and initial_value.graph.building_function: + raise ValueError( + f"Argument `initial_value` ({initial_value}) could not " + "be lifted out of a `tf.function`. " + f"(Tried to create variable with name='{name}'). " + "To avoid this error, when constructing `tf.Variable`s " + "inside of `tf.function` you can create the " + "`initial_value` tensor in a " + "`tf.init_scope` or pass a callable `initial_value` " + "(e.g., `tf.Variable(lambda : " + "tf.truncated_normal([10, 40]))`). " + "Please file a feature request if this " + "restriction inconveniences you." + ) + + if collections is None: + collections = [ops.GraphKeys.GLOBAL_VARIABLES] + if not isinstance(collections, (list, tuple, set)): + raise ValueError( + f"collections argument to Variable constructor must be a list, " + f"tuple, or set. Got {collections} of type {type(collections)}" + ) + if constraint is not None and not callable(constraint): + raise ValueError( + f"Argument `constraint` must be None or a callable. " + f"a callable. 
Got a {type(constraint)}: {constraint}" + ) + + if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections: + collections = list(collections) + [ops.GraphKeys.TRAINABLE_VARIABLES] + + self._save_slice_info = None + self._in_graph_mode = not context.executing_eagerly() + self._steps_to_live = evconfig.steps_to_live + self._init_data_source = evconfig.init_data_source + self._emb_index = evconfig.emb_index + self._slot_index = evconfig.slot_index + self._block_num = evconfig.block_num + self._block_handle_name = None + self._primary = evconfig.primary + self._ht_type = evconfig.ht_type + self._ht_partition_num = ht_partition_num + self._is_sparse = False + self.importer = None + if evconfig.filter_strategy != None: + if isinstance(evconfig.filter_strategy, ev_variables.CounterFilter): + self._filter_freq = evconfig.filter_strategy.filter_freq + self._max_element_size = 0 + self._false_positive_probability = -1.0 + self._counter_type = dtypes.uint64 + elif isinstance(evconfig.filter_strategy, ev_variables.CBFFilter): + self._filter_freq = evconfig.filter_strategy.filter_freq + self._max_element_size = evconfig.filter_strategy.max_element_size + self._false_positive_probability = evconfig.filter_strategy.false_positive_probability + self._counter_type = evconfig.filter_strategy.counter_type + else: + self._filter_freq = 0 + self._max_element_size = 0 + self._false_positive_probability = -1.0 + self._counter_type = dtypes.uint64 + + self._record_freq = (os.environ.get("TF_RECORD_FREQ", "0") == "1") + self._record_version = (os.environ.get("TF_RECORD_VERSION", "0") == "1") + self._l2_weight_threshold = evconfig.l2_weight_threshold + self._storage_type = evconfig.storage_type + self._storage_path = evconfig.storage_path + self._storage_size = evconfig.storage_size + self._default_value_dim = evconfig.default_value_dim + self._default_value_no_permission = evconfig.default_value_no_permission + self._storage_cache_strategy = evconfig.storage_cache_strategy + self._layout = evconfig.layout + + if self._primary is None: + self._is_primary = True + else: + self._is_primary = False + + with ops.init_scope(): + self._in_graph_mode = not context.executing_eagerly() + if experimental_enable_variable_lifting: + maybe_init_scope = ops.init_scope + else: + maybe_init_scope = contextlib.nullcontext + with maybe_init_scope(): + with ops.name_scope(name, "Variable", [] if init_from_fn else [initial_value], skip_on_eager=False) as name: + self._invalid_key = invalid_key + self._invalid_key_type = ops.convert_to_tensor(invalid_key, name="invalid_key").dtype.base_dtype + handle_name = ops.name_from_scope_name(name) + shared_name = handle_name + if self._in_graph_mode: + unique_id = shared_name + else: + # When in eager mode, use a uid for the shared_name, to prevent + # accidental sharing. + unique_id = "%s_%d" % (handle_name, ops.uid()) + self._unique_id = unique_id + if handle_name is None: + self._handle_name = "Variable:0" + else: + self._handle_name = handle_name + ":0" + # Use attr_scope and device(None) to simulate the behavior of + # colocate_with when the variable we want to colocate with doesn't + # yet exist. 
+ device_context_manager = (ops.device if self._in_graph_mode else ops.NullContextmanager) + attr = attr_value_pb2.AttrValue( + list=attr_value_pb2.AttrValue.ListValue(s=[compat.as_bytes("loc:@%s" % handle_name)]) + ) + with ops.get_default_graph()._attr_scope({"_class": attr}): + with ops.name_scope("Initializer"), device_context_manager(None): + if init_from_fn: + initial_value = initial_value() + if isinstance(initial_value, trackable.CheckpointInitialValue): + self._maybe_initialize_trackable() + self._update_uid = initial_value.checkpoint_position.restore_uid + initial_value = initial_value.wrapped_value + initial_value = ops.convert_to_tensor(initial_value, name="initial_value", dtype=dtype) + rank = initial_value.get_shape().rank - 1 + if shape is not None: + if not initial_value.shape.is_compatible_with(shape): + raise ValueError( + f"In this `tf.Variable` creation, the initial value's shape " + f"({initial_value.shape}) is not compatible with " + f"the explicitly supplied `shape` argument ({shape})." + ) + else: + shape = initial_value.get_shape()[rank:] + _device = "GPU" if self._storage_type in [ + config_pb2.StorageType.HBM, config_pb2.StorageType.HBM_DRAM, config_pb2.StorageType.HBM_DRAM_SSDHASH + ] else "CPU" + with ops.device(_device): + handle = eager_safe_variable_handle( + initial_value=initial_value, + shape=shape, + key_type=self._invalid_key_type, + shared_name=shared_name, + name=name, + graph_mode=self._in_graph_mode + ) + handle._parent_trackable = weakref.ref(self) + handle._name = handle_name + ":0" + handle._unique_id = unique_id + self._handle = handle + # pylint: disable=protected-access + if ( + self._in_graph_mode and initial_value is not None and + initial_value.op._get_control_flow_context() is not None + ): + raise ValueError( + f"The `initial_value` passed to `tf.Variable` {name} is from " + f"inside a control-flow construct, such as a loop or " + f"conditional. When creating a " + f"`tf.Variable` inside a loop or conditional, use a lambda as " + f"the `initial_value`. 
Got: initial_value=({initial_value})" + ) + # pylint: enable=protected-access + dtype = initial_value.dtype.base_dtype + self._counts_tensor = {} + self._is_multi_tier = self.is_multi_tier(self._storage_type) + if self._primary is None: + self._primary = self + + if self._is_primary: + self._slot_num = flags.FLAGS.ev_slot_num + else: + self._slot_num = evconfig.slot_num + + if self._in_graph_mode: + with ops.name_scope("IsInitialized"): + self._is_initialized_op = ( + gen_kv_variable_ops.kv_var_is_initialized_op(handle, Tkeys=self._invalid_key_type, dtype=self._dtype) + ) + if initial_value is not None: + # pylint: disable=g-backslash-continuation + with ops.name_scope("Assign") as n, \ + ops.colocate_with(None, ignore_existing=True), \ + ops.device(handle.device): + with ops.control_dependencies(None if self._is_primary else [self._primary.initializer]): + self._init_op = gen_kv_variable_ops.initialize_kv_variable_v2_op( + handle, + self._primary._handle, + variables._try_guard_against_uninitialized_dependencies(name, initial_value), + ops.convert_to_tensor(invalid_key), + slot_num=self._slot_num, + shape=initial_value.get_shape()[rank:], + steps_to_live=self._steps_to_live, + emb_index=self._emb_index, + block_num=self.block_num, + slot_index=self._slot_index, + ht_type=self._ht_type, + ht_partition_num=self._ht_partition_num, + filter_freq=self._filter_freq, + l2_weight_threshold=self._l2_weight_threshold, + max_element_size=self._max_element_size, + false_positive_probability=self._false_positive_probability, + counter_type=self._counter_type, + max_freq=99999, + layout=self._layout, + storage_type=self._storage_type, + storage_path=self._storage_path, + storage_size=self._storage_size, + default_value_dim=self._default_value_dim, + default_value_no_permission=self._default_value_no_permission, + record_freq=self._record_freq, + record_version=self._record_version, + embedding_variable_type=config_pb2.EmbeddingVariableType.IMMUTABLE, + name=n + ) + set_attr_ops = [] + + if self._is_primary and self._is_multi_tier: + with ops.control_dependencies([self._init_op]): + set_cache_strategy_op = gen_kv_variable_ops.kv_resource_init_cache_strategy_op( + self._handle, + cache_strategy=self._storage_cache_strategy, + Tkeys=self._invalid_key_type, + dtype=dtype + ) + set_attr_ops.append(set_cache_strategy_op) + with ops.control_dependencies(set_attr_ops + [self._init_op]): + self._initializer_op = control_flow_ops.no_op() + + self.create_init_op_for_restore(name, initial_value, invalid_key, rank) + else: + self._init_op = gen_kv_variable_ops.initialize_kv_variable_v2_op( + handle, + self._primary._handle, + initial_value, + ops.convert_to_tensor(invalid_key), + slot_num=self._slot_num, + shape=shape, + steps_to_live=self._steps_to_live, + emb_index=self._emb_index, + block_num=self.block_num, + slot_index=self._slot_index, + ht_type=self._ht_type, + ht_partition_num=self._ht_partition_num, + filter_freq=self._filter_freq, + l2_weight_threshold=self._l2_weight_threshold, + max_element_size=self._max_element_size, + false_positive_probability=self._false_positive_probability, + counter_type=self._counter_type, + max_freq=99999, + layout=self._layout, + storage_type=self._storage_type, + storage_path=self._storage_path, + storage_size=self._storage_size, + default_value_dim=self._default_value_dim, + default_value_no_permission=self._default_value_no_permission, + record_freq=self._record_freq, + record_version=self._record_version, + 
embedding_variable_type=config_pb2.EmbeddingVariableType.IMMUTABLE + ) + if self._is_primary and self._is_multi_tier: + with ops.control_dependencies([self._init_op]): + set_cache_strategy_op = gen_kv_variable_ops.kv_resource_init_cache_strategy_op( + self._handle, cache_strategy=self._storage_cache_strategy, Tkeys=self._invalid_key_type, dtype=dtype + ) + + if self._in_graph_mode: + # Eager variables are only added to collections if they are part of an + # eager variable store (otherwise in an interactive session they would + # hog memory and cause OOM). This is done in ops/variable_scope.py. + ops.add_to_collections(collections, self) + elif ops.GraphKeys.GLOBAL_STEP in collections: + ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, self) + initial_value = initial_value if self._in_graph_mode else None + super(EmbeddingVariable, self).__init__( + trainable=trainable, + shape=shape, + dtype=dtype, + handle=handle, + synchronization=synchronization, + constraint=constraint, + aggregation=aggregation, + distribute_strategy=distribute_strategy, + name=name, + initial_value=initial_value, + caching_device=caching_device, + validate_shape=validate_shape, + ) + + def is_multi_tier(self, storage_type): + multi_level_list = [ + config_pb2.StorageType.LEVELDB, config_pb2.StorageType.SSDHASH, config_pb2.StorageType.DRAM_PMEM, + config_pb2.StorageType.DRAM_LEVELDB, config_pb2.StorageType.DRAM_SSDHASH, config_pb2.StorageType.HBM_DRAM, + config_pb2.StorageType.DRAM_PMEM_SSDHASH, config_pb2.StorageType.HBM_DRAM_SSDHASH + ] + return storage_type in multi_level_list + + def create_init_op_for_restore(self, name, initial_value, invalid_key, rank): + with ops.control_dependencies(None if self._is_primary else [self._primary._init_op_for_restore]): + self._initializer_for_restore = gen_kv_variable_ops.initialize_kv_variable_v2_op( + self._handle, + self._primary._handle, + variables._try_guard_against_uninitialized_dependencies(name, initial_value), + ops.convert_to_tensor(invalid_key), + initial_num_buckets=config_pb2.IsSetInitialized.NOT_SET_INITAILIZED, + slot_num=self._slot_num, + shape=initial_value.get_shape()[rank:], + steps_to_live=self._steps_to_live, + emb_index=self._emb_index, + block_num=self.block_num, + slot_index=self._slot_index, + ht_type=self._ht_type, + ht_partition_num=self._ht_partition_num, + filter_freq=self._filter_freq, + l2_weight_threshold=self._l2_weight_threshold, + max_element_size=self._max_element_size, + false_positive_probability=self._false_positive_probability, + counter_type=self._counter_type, + max_freq=99999, + layout=self._layout, + storage_type=self._storage_type, + storage_path=self._storage_path, + storage_size=self._storage_size, + default_value_dim=self._default_value_dim, + default_value_no_permission=self._default_value_no_permission, + record_freq=self._record_freq, + record_version=self._record_version, + embedding_variable_type=config_pb2.EmbeddingVariableType.IMMUTABLE + ) + set_attr_ops = [] + if self._is_primary and self._is_multi_tier: + with ops.control_dependencies([self._initializer_for_restore]): + set_cache_op = gen_kv_variable_ops.kv_resource_init_cache_strategy_op( + self._handle, cache_strategy=self._storage_cache_strategy, Tkeys=self._invalid_key_type, dtype=self._dtype + ) + set_attr_ops.append(set_cache_op) + with ops.control_dependencies(set_attr_ops + [self._initializer_for_restore]): + self._init_op_for_restore = control_flow_ops.no_op() + # self.collect_restore_denpendencies() + + def sparse_read(self, indices, name=None, 
ev_init_value=None, counts=None): + """Reads the value of this variable sparsely, using `gather`.""" + with ops.name_scope("Gather" if name is None else name) as name: + if self._trainable: + tape.variable_accessed(self) + if ev_init_value is not None: + default_value = math_ops.cast(ev_init_value, self.dtype) + is_use_default_value_tensor = True + else: + default_value = ops.convert_to_tensor(1.0, dtype=self.dtype) + is_use_default_value_tensor = False + if counts is not None: + value = gen_kv_variable_ops.kv_resource_gather_v1( + self._handle, indices, default_value, counts, is_inference=True, name=name + ) + self._counts_tensor[indices] = counts + else: + value = gen_kv_variable_ops.kv_resource_gather( + self._handle, indices, default_value, is_use_default_value_tensor, is_inference=True, name=name + ) + return value + + @property + def initializer(self): + """The op responsible for initializing this variable.""" + return self._initializer_op + + @property + def initial_value(self): + """Returns the Tensor used as the initial value for the variable.""" + if context.executing_eagerly(): + raise RuntimeError("initial_value not supported in EAGER mode.") + return self._initial_value + + def is_initialized(self): + return gen_kv_variable_ops.kv_var_is_initialized_op(self._handle, Tkeys=self._invalid_key_type, dtype=self._dtype) + + def is_all_slot_initialized(self): + return gen_kv_variable_ops.kv_var_is_all_slot_initialized_op( + self._handle, Tkeys=self._invalid_key_type, dtype=self._dtype + ) + + @property + def block_num(self): + if self._block_num is None: + return 1 + else: + return self._block_num + + def need_counts(self): + return self._record_freq or (self._filter_freq > 0) or self._is_multi_tier + + @property + def storage_type(self): + return self._storage_type + + def lookup_resource(self): + return gen_kv_variable_ops.kv_resource_lookup_resource(self.handle, Tkeys=self._invalid_key_type, dtype=self._dtype) + + # Unused + # def _gather_saveables_for_checkpoint(self): + # return {"foo": lambda name: EmbeddingVariableSaveable(self, name)} + + +def lookup_resource(var): + return gen_kv_variable_ops.kv_resource_lookup_resource(var.handle, Tkeys=var._invalid_key_type, dtype=var._dtype) + + +def variable_shape(handle, indices, grad): + handle_data = get_eager_safe_handle_data(handle) + if handle_data is None or not handle_data.is_set: + return gen_kv_variable_ops.kv_variable_shape(handle, Tkeys=indices.dtype, dtype=grad.dtype) + shape_proto = handle_data.shape_and_type[0].shape + if shape_proto.unknown_rank or any(x.size == -1 for x in shape_proto.dim): + return gen_kv_variable_ops.kv_variable_shape(handle, Tkeys=indices.dtype, dtype=grad.dtype) + return constant_op.constant([x.size for x in shape_proto.dim], dtype=dtypes.int32) + + +def get_tensor_slices(trackables): + tensor_names = [] + shapes_and_slices = [] + tensors = [] + restored_trackables = [] + ev_names = [] + ev_resources = [] + ev_key_types = [] + has_ev = False + for obj_prefix, obj in trackables.items(): + if isinstance(obj, EmbeddingVariable): + ev_names.append(obj.name) + ev_resources.append(obj.lookup_resource()) + ev_key_types.append(obj._invalid_key_type) + has_ev = True + + tensor_names.append(obj_prefix + "/value") + shapes_and_slices.append("") + tensors.append(constant_op.constant(2, dtype=obj.dtype)) + return tensor_names, shapes_and_slices, tensors, restored_trackables, ev_names, ev_resources, ev_key_types, has_ev + + +def save_fn(trackables, file_prefix): + """Save stack and part objects to a checkpoint 
shard.""" + tensor_names, shapes_and_slices, tensors, _, ev_names, ev_resources, ev_key_types, has_ev = get_tensor_slices( + trackables + ) + gen_kv_variable_ops.save_v3( + file_prefix, tensor_names, shapes_and_slices, ev_names, ev_resources, tensors, ev_key_types, has_ev + ) + return file_prefix + + +restore_queue = dict() + + +def restore_fn(trackables, merged_prefix): + for obj_prefix, obj in trackables.items(): + # Initialize queue entry if not exists + if obj._primary.name not in restore_queue: + restore_queue[obj._primary.name] = [] + restore_queue[obj._primary.name].append(obj) + if obj.is_all_slot_initialized(): + for ev in restore_queue[obj._primary.name]: + gen_kv_variable_ops.kv_resource_import_v3( + merged_prefix, + ev.handle, + ev.name, + ops.convert_to_tensor(ev._invalid_key), + shape=ev.shape, + partition_id=0, + partition_num=1, + dtype=ev.dtype + ) + + +registration.register_checkpoint_saver( + name="EmbeddingVariable", + predicate=lambda x: isinstance(x, (EmbeddingVariable)), + save_fn=save_fn, + restore_fn=restore_fn +) + + +@ops.RegisterGradient("KvResourceGather") +def _GatherGrad(op, grad): + """Gradient for gather op.""" + # Build appropriately shaped IndexedSlices + handle = op.inputs[0] + indices = op.inputs[1] + params_shape = variable_shape(handle, indices, grad) + size = array_ops.expand_dims(array_ops.size(indices), 0) + values_shape = array_ops.concat([size, params_shape[0:]], 0) + values = array_ops.reshape(grad, values_shape) + indices = array_ops.reshape(indices, size) + return [indexed_slices.IndexedSlices(values, indices, params_shape), None, None] + + +@ops.RegisterGradient("KvResourceGatherV1") +def _GatherV1Grad(op: ops.Operation, grad): + """Gradient for gather op.""" + # Build appropriately shaped IndexedSlices + handle = op.inputs[0] + indices = op.inputs[1] + params_shape = variable_shape(handle, indices, grad) + size = array_ops.expand_dims(array_ops.size(indices), 0) + values_shape = array_ops.concat([size, params_shape[0:]], 0) + values = array_ops.reshape(grad, values_shape) + indices = array_ops.reshape(indices, size) + return [indexed_slices.IndexedSlices(values, indices, params_shape), None, None] + + +ops.NotDifferentiable("KvVarIsInitializedOp") +ops.NotDifferentiable("KvVariableShape") + + +class EmbeddingVariableSaveable(saveable_object.SaveableObject): + """SaveableObject implementation that handles EmbeddingVariables.""" + + def __init__(self, var, name): + self.handle_op = var.handle + self.invalid_key = var.invalid_key + self.dtype = var._dtype + self.key_type = var._invalid_key_type + self.steps_to_live = var.steps_to_live + self.ht_type = var._ht_type + self.ht_partition_num = var._ht_partition_num + name = var._shared_name + self.var = var + is_partitioned_ev = not isinstance(self.var._save_slice_info, str) + self.partition_id = 0 + self.partition_num = 1 + if self.var._save_slice_info is not None: + self.partition_id = self.var._save_slice_info.var_offset[0] if is_partitioned_ev else 0 + self.partition_num = self.var._save_slice_info.full_shape[0] if is_partitioned_ev else 1 + + def _read_variable_closure(v): + + def f(): + with ops.device(v.device): + x = v.read_value() + return array_ops.identity(x) + + return f + + unused_tensor = var.handle + self.resource = lookup_resource(var) + + specs = [] + specs.append(saveable_object.SaveSpec(unused_tensor, "", name + "-keys", dtype=self.key_type, device=var.device)) + specs.append(saveable_object.SaveSpec(unused_tensor, "", name + "-values", dtype=dtypes.float32, device=var.device)) + 
specs.append(saveable_object.SaveSpec(unused_tensor, "", name + "-versions", dtype=dtypes.int64, device=var.device)) + specs.append(saveable_object.SaveSpec(unused_tensor, "", name + "-freqs", dtype=dtypes.int64, device=var.device)) + + # pylint: disable=protected-access + super(EmbeddingVariableSaveable, self).__init__(var, specs, name) + self.is_sparse = var._is_sparse + + def restore(self, restored_tensors, unused_restored_shapes): + # pylint: disable=protected-access + with ops.device("/cpu:0"): + name_tensor = ops.convert_to_tensor(self.name) + with ops.colocate_with(self.handle_op): + handle_name = ops.name_from_scope_name(self.name) + is_partitioned_ev = not isinstance(self.var._save_slice_info, str) + if self.var._init_data_source is not None: + return self.var.recover_from_init_data_source(self.var._init_data_source, self.partition_id, self.partition_num) + else: + restore_dependency = ops.get_collection(ops.GraphKeys.EMBEDDING_VARIABLE_RESTORE_DEPENDENCY)[0] + with ops.control_dependencies(restore_dependency[self.var._primary_handle]): + rank = self.op.initial_value.get_shape().rank - 1 + restore_op = gen_kv_variable_ops.kv_resource_import_v3( + restored_tensors[0], + self.handle_op, + name_tensor, + ops.convert_to_tensor(self.invalid_key), + shape=self.op.initial_value.get_shape()[rank:], + partition_id=self.partition_id, + partition_num=self.partition_num, + dtype=self.var._dtype + ) + return restore_op + + def incr_restore(self, restored_tensors, unused_restored_shapes): + # pylint: disable=protected-access + name_tensor = ops.convert_to_tensor(self.name) + with ops.colocate_with(self.handle_op): + handle_name = ops.name_from_scope_name(self.name) + return gen_kv_variable_ops.kv_resource_incr_import( + restored_tensors[0], + self.handle_op, + name_tensor, + ops.convert_to_tensor(self.invalid_key), + variables._try_guard_against_uninitialized_dependencies(self.name, self.op.initial_value), + partition_id=self.partition_id, + partition_num=self.partition_num + ) diff --git a/build_deps/toolchains/gpu/BUILD b/deepray/custom_ops/embedding_variable/python/tests/__init__.py similarity index 100% rename from build_deps/toolchains/gpu/BUILD rename to deepray/custom_ops/embedding_variable/python/tests/__init__.py diff --git a/deepray/custom_ops/embedding_variable/python/tests/embedding_bag_test.py b/deepray/custom_ops/embedding_variable/python/tests/embedding_bag_test.py new file mode 100644 index 00000000..f1d1ee33 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/python/tests/embedding_bag_test.py @@ -0,0 +1,116 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for EmbeddingBag layer.""" + +import pytest +import numpy as np +import tensorflow as tf + +from deepray.custom_ops.embedding_bag import EmbeddingBag, _embedding_bag +from deepray.utils import test_utils + + +def manual_embedding_bag(indices, params, weights=None, combiner="mean"): + gathered = tf.gather(params, indices) + if weights is not None: + gathered *= tf.expand_dims(weights, -1) + if combiner == "sum": + return tf.reduce_sum(gathered, -2, keepdims=False) + else: + assert combiner == "mean" + assert weights is None + return tf.reduce_mean(gathered, -2, keepdims=False) + + +@pytest.mark.with_device(["cpu", "gpu"]) +@pytest.mark.parametrize("input_shape", [(16, 32)]) +@pytest.mark.parametrize("input_dim", [63, 64]) +@pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64]) +@pytest.mark.parametrize("indices_dtype", [np.int32, np.int64]) +@pytest.mark.parametrize("combiner", ["sum", "mean"]) +def test_forward(input_shape, input_dim, dtype, indices_dtype, combiner): + indices = np.random.randint(low=0, high=input_dim, size=input_shape).astype(indices_dtype) + params = np.random.random(size=(input_dim, 16)).astype(dtype) + if combiner == "sum": + weights = np.random.random(size=indices.shape).astype(dtype) + else: + weights = None + expected = manual_embedding_bag(indices, params, weights, combiner=combiner) + embedding_bag = EmbeddingBag(input_dim, 16, combiner=combiner, dtype=dtype) + embedding_bag.build(indices.shape) + embedding_bag.set_weights([params]) + indices = tf.convert_to_tensor(indices) + if weights is not None: + weights = tf.convert_to_tensor(weights) + output = embedding_bag( + indices, + weights, + ) + test_utils.assert_allclose_according_to_type(expected, output, half_rtol=1e-2, half_atol=1e-2) + + +@pytest.mark.with_device(["cpu", "gpu"]) +@pytest.mark.parametrize("input_shape", [(16, 32)]) +@pytest.mark.parametrize("input_dim", [63, 64]) +@pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64]) +@pytest.mark.parametrize("indices_dtype", [np.int32, np.int64]) +@pytest.mark.parametrize("combiner", ["sum", "mean"]) +@pytest.mark.usefixtures("maybe_run_functions_eagerly") +def test_backward(input_shape, input_dim, dtype, indices_dtype, combiner): + indices = np.random.randint(low=0, high=input_dim, size=input_shape).astype(indices_dtype) + params = np.random.random(size=(input_dim, 16)).astype(dtype) + if combiner == "sum": + weights = np.random.random(size=indices.shape).astype(dtype) + else: + weights = None + + indices = tf.convert_to_tensor(indices) + params = tf.convert_to_tensor(params) + if weights is not None: + weights = tf.convert_to_tensor(weights) + + embedding_bag_fn = tf.function(_embedding_bag) + + if combiner == "sum": + with tf.GradientTape(persistent=True) as tape: + tape.watch([params, weights]) + output = embedding_bag_fn(indices, params, weights, combiner="sum") + expected = manual_embedding_bag(indices, params, weights, combiner="sum") + + grads = tape.gradient(output, [params, weights]) + expected_grads = tape.gradient(expected, [params, weights]) + # Gather returns sparse IndexedSlices so we have to sum them together. 
+ test_utils.assert_allclose_according_to_type( + tf.convert_to_tensor(expected_grads[0]), + tf.convert_to_tensor(grads[0]), + half_rtol=1e-2, + half_atol=1e-2, + ) + test_utils.assert_allclose_according_to_type(expected_grads[1], grads[1], half_rtol=1e-2, half_atol=1e-2) + else: + with tf.GradientTape(persistent=True) as tape: + tape.watch(params) + output = embedding_bag_fn(indices, params, combiner=combiner) + expected = manual_embedding_bag(indices, params, combiner=combiner) + + grads = tape.gradient(output, [params]) + expected_grads = tape.gradient(expected, [params]) + # Gather returns sparse IndexedSlices so we have to sum them together. + test_utils.assert_allclose_according_to_type( + tf.convert_to_tensor(expected_grads[0]), + tf.convert_to_tensor(grads[0]), + half_rtol=1e-2, + half_atol=1e-2, + ) diff --git a/deepray/custom_ops/embedding_variable/python/tests/group_embedding_lookup_ops_test.py b/deepray/custom_ops/embedding_variable/python/tests/group_embedding_lookup_ops_test.py new file mode 100644 index 00000000..f41f9179 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/python/tests/group_embedding_lookup_ops_test.py @@ -0,0 +1,254 @@ +"""Tests for tensorflow.ops.embedding_variable GPU version.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from tensorflow.python.platform import googletest +from tensorflow.python.ops import variable_scope +from tensorflow.python.ops import variables +from tensorflow.python.ops import embedding_ops + +from deepray.custom_ops.embedding_variable import kv_variable_ops +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import array_ops + +from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import sparse_tensor +from tensorflow.python.framework import config +from deepray.custom_ops.embedding_variable import config_pb2 +from tensorflow.python.training import training_util +from tensorflow.python.training import adagrad +from tensorflow.python.feature_column import feature_column +from tensorflow.python.feature_column import feature_column_v2 +from deepray.custom_ops.embedding_variable.variable_scope import get_embedding_variable +from deepray.custom_ops.embedding_variable import variables as ev_variables + + +class GroupEmbeddingGPUTest(test_util.TensorFlowTestCase): + + @test_util.run_gpu_only + def testMultiKvResourceGather(self): + print("testMultiKvResourceGather") + + def runTestAdagrad(embedding_weights, indices, combiners): + emb = embedding_ops.group_embedding_lookup_sparse(embedding_weights, indices, combiners) + contcat_emb = array_ops.concat(emb, axis=-1) + fun = math_ops.multiply(contcat_emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + init = variables.global_variables_initializer() + with self.test_session(use_gpu=True, force_gpu=True) as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + r, _, _ = sess.run([emb, train_op, loss]) + return r + + with ops.device('/GPU:0'): + emb_var_0 = get_embedding_variable( + "emb_var_0", embedding_dim=8, 
initializer=init_ops.ones_initializer(dtypes.float32) + ) + + emb_var_1 = get_embedding_variable( + "emb_var_1", embedding_dim=16, initializer=init_ops.ones_initializer(dtypes.float32) + ) + + indices_0 = sparse_tensor.SparseTensor( + indices=ops.convert_to_tensor([[0, 0], [1, 1], [2, 0], [2, 1], [3, 2]], dtype=dtypes.int64), + values=ops.convert_to_tensor([1, 1, 3, 4, 5], dtype=dtypes.int64), + dense_shape=[4, 3] + ) + + indices = [indices_0 for _ in range(2)] + ev_weights = [emb_var_0, emb_var_1] + combiners = ["mean", "sum"] + + ev_result = runTestAdagrad(ev_weights, indices, combiners) + for i in range(4): + if i == 2: + for j in range(16): + self.assertEqual(ev_result[1].tolist()[i][j], 2) + else: + for j in range(16): + self.assertEqual(ev_result[1].tolist()[i][j], 1) + + for i in range(4): + for j in range(8): + self.assertEqual(ev_result[0].tolist()[i][j], 1) + + @test_util.run_gpu_only + def testMultiEmbeddingSparseLookUp(self): + print("testMultiEmbeddingSparseLookUp") + + def runTestAdagrad(embedding_weights, indices, combiners): + emb = embedding_ops.group_embedding_lookup_sparse(embedding_weights, indices, combiners) + contcat_emb = array_ops.concat(emb, axis=-1) + fun = math_ops.multiply(contcat_emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + init = variables.global_variables_initializer() + with self.test_session(use_gpu=True, force_gpu=True) as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + r, _, _ = sess.run([emb, train_op, loss]) + return r + + with ops.device('/GPU:0'): + + var_0 = variable_scope.get_variable( + "var_0", initializer=init_ops.ones_initializer(dtypes.float32), shape=(1000, 8) + ) + var_1 = variable_scope.get_variable( + "var_1", initializer=init_ops.ones_initializer(dtypes.float32), shape=(1000, 16) + ) + + indices_0 = sparse_tensor.SparseTensor( + indices=ops.convert_to_tensor([[0, 0], [1, 1], [2, 0], [2, 1], [3, 2]], dtype=dtypes.int64), + values=ops.convert_to_tensor([1, 1, 3, 4, 5], dtype=dtypes.int64), + dense_shape=[4, 3] + ) + + indices = [indices_0 for _ in range(2)] + var_weights = [var_0, var_1] + combiners = ["mean", "sum"] + + var_result = runTestAdagrad(var_weights, indices, combiners) + for i in range(4): + if i == 2: + for j in range(16): + self.assertEqual(var_result[1].tolist()[i][j], 2) + else: + for j in range(16): + self.assertEqual(var_result[1].tolist()[i][j], 1) + + for i in range(4): + for j in range(8): + self.assertEqual(var_result[0].tolist()[i][j], 1) + + @test_util.run_gpu_only + def testMultiKvResourceGatherEqualMultiEmbeddingSparseLookUp(self): + print("testMultiKvResourceGather") + + def runTestAdagrad(embedding_weights, indices, combiners): + emb = embedding_ops.group_embedding_lookup_sparse(embedding_weights, indices, combiners) + contcat_emb = array_ops.concat(emb, axis=-1) + fun = math_ops.multiply(contcat_emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + gs = training_util.get_or_create_global_step() + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + init = variables.global_variables_initializer() + with self.test_session(use_gpu=True, force_gpu=True) as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + 
sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run([init]) + r, _, _ = sess.run([emb, train_op, loss]) + r, _, _ = sess.run([emb, train_op, loss]) + r, _, _ = sess.run([emb, train_op, loss]) + r, _, _ = sess.run([emb, train_op, loss]) + r, _, _ = sess.run([emb, train_op, loss]) + return r + + with ops.device('/GPU:0'): + emb_var_1 = get_embedding_variable( + "emb_var_0", embedding_dim=16, initializer=init_ops.ones_initializer(dtypes.float32) + ) + + emb_var_2 = get_embedding_variable( + "emb_var_1", embedding_dim=16, initializer=init_ops.ones_initializer(dtypes.float32) + ) + + var_0 = variable_scope.get_variable( + "var_0", initializer=init_ops.ones_initializer(dtypes.float32), shape=(1000, 16) + ) + var_1 = variable_scope.get_variable( + "var_1", initializer=init_ops.ones_initializer(dtypes.float32), shape=(1000, 16) + ) + + indices_0 = sparse_tensor.SparseTensor( + indices=ops.convert_to_tensor([[0, 0], [1, 1], [2, 0], [2, 1], [3, 2]], dtype=dtypes.int64), + values=ops.convert_to_tensor([1, 1, 3, 4, 5], dtype=dtypes.int64), + dense_shape=[4, 3] + ) + + indices = [indices_0 for _ in range(4)] + weights = [emb_var_1, emb_var_2, var_0, var_1] + combiners = ["mean", "sum", "mean", "sum"] + + ev_result = runTestAdagrad(weights, indices, combiners) + + for i in range(2): + for j in range(0, 4): + for k in range(0, 16): + self.assertNear(ev_result[i].tolist()[j][k], ev_result[2 + i].tolist()[j][k], 1e-05) + + @test_util.run_gpu_only + def testMultiKvResourceGatherForSparseColumnEmbeddingCol(self): + with feature_column_v2.group_embedding_column_scope(name="test"): + ad_columns = feature_column_v2.categorical_column_with_embedding( + key="ad_emb", + dtype=dtypes.int64, + ev_option=ev_variables.EmbeddingVariableOption( + storage_option=ev_variables.StorageOption(storage_type=config_pb2.StorageType.HBM) + ) + ) + ad_weights = feature_column_v2.embedding_column( + categorical_column=ad_columns, dimension=8, initializer=init_ops.ones_initializer(dtypes.float32) + ) + + user_columns = feature_column_v2.categorical_column_with_embedding( + key="user_emb", + dtype=dtypes.int64, + ev_option=variables.EmbeddingVariableOption( + storage_option=variables.StorageOption(storage_type=config_pb2.StorageType.HBM) + ) + ) + user_weights = feature_column_v2.embedding_column( + categorical_column=user_columns, dimension=16, initializer=init_ops.ones_initializer(dtypes.float32) + ) + + ids = {} + ids["ad_emb"] = sparse_tensor.SparseTensor( + indices=[[0, 0], [1, 1], [2, 2], [3, 3], [4, 3]], + values=math_ops.cast([1, 2, 3, 4, 5], dtypes.int64), + dense_shape=[5, 4] + ) + ids["user_emb"] = sparse_tensor.SparseTensor( + indices=[[0, 0], [1, 1], [2, 2], [2, 3], [4, 3]], + values=math_ops.cast([1, 2, 3, 4, 5], dtypes.int64), + dense_shape=[5, 4] + ) + + emb = feature_column.input_layer(features=ids, feature_columns=[ad_weights, user_weights]) + + fun = math_ops.multiply(emb, 2.0, name='multiply') + loss = math_ops.reduce_sum(fun, name='reduce_sum') + opt = adagrad.AdagradOptimizer(0.1) + g_v = opt.compute_gradients(loss) + train_op = opt.apply_gradients(g_v) + init = variables.global_variables_initializer() + with self.test_session(force_gpu=True) as sess: + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS)) + sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS)) + sess.run(init) + print("init global done") + print(sess.run([emb, train_op, loss])) + print(sess.run([emb, train_op, loss])) + print(sess.run([emb, train_op, loss])) + + +if __name__ == "__main__": + googletest.main() 
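For reference, the sketch below is not part of the patch; it is a minimal, illustrative test of the `EmbeddingVariable` lookup path that the tests above exercise end to end: `get_embedding_variable` builds a hash-table backed variable and `sparse_read` gathers one embedding row per arbitrary int64 id. Import paths, argument names, and the `EV_INIT_*` collections are taken from the calls shown in this patch; whether the deepray package registers the required flags (e.g. `ev_slot_num`) and graph-collection keys on import is an assumption, so treat this as a sketch rather than a supported example.

```python
# Illustrative only -- NOT part of this patch. Minimal sketch of the
# EmbeddingVariable lookup path exercised by the tests above, assuming the
# deepray package registers its flags and EV_INIT_* collection keys on import.
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import test_util
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.platform import googletest

from deepray.custom_ops.embedding_variable.variable_scope import get_embedding_variable


class EmbeddingVariableSketchTest(test_util.TensorFlowTestCase):

  def testSparseRead(self):
    # Ids are not restricted to a fixed [0, vocab) range; an unseen id gets a
    # freshly initialized row from the configured initializer.
    ids = ops.convert_to_tensor([1, 7, 7, 42], dtype=dtypes.int64)
    var = get_embedding_variable(
        "sketch_emb", embedding_dim=8, initializer=init_ops.ones_initializer(dtypes.float32)
    )
    emb = var.sparse_read(ids)  # expected shape: [4, 8]
    loss = math_ops.reduce_sum(emb)
    with self.test_session() as sess:
      # EmbeddingVariables are initialized through their own collections,
      # exactly as GroupEmbeddingGPUTest does before running the graph.
      sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_VAR_OPS))
      sess.run(ops.get_collection(ops.GraphKeys.EV_INIT_SLOT_OPS))
      print(sess.run([emb, loss]))


if __name__ == "__main__":
  googletest.main()
```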
diff --git a/deepray/custom_ops/embedding_variable/python/tests/run_all_test.py b/deepray/custom_ops/embedding_variable/python/tests/run_all_test.py new file mode 100644 index 00000000..8261049e --- /dev/null +++ b/deepray/custom_ops/embedding_variable/python/tests/run_all_test.py @@ -0,0 +1,7 @@ +from pathlib import Path +import sys +import pytest + +if __name__ == "__main__": + dirname = Path(__file__).absolute().parent + sys.exit(pytest.main(["-s", str(dirname)])) diff --git a/deepray/custom_ops/embedding_variable/variable_scope.py b/deepray/custom_ops/embedding_variable/variable_scope.py new file mode 100644 index 00000000..a1530297 --- /dev/null +++ b/deepray/custom_ops/embedding_variable/variable_scope.py @@ -0,0 +1,1277 @@ +import collections as collections_lib +import copy +import functools +import traceback + +from tensorflow.python.eager import context +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops.variable_scope import AUTO_REUSE +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.util import function_utils +from tensorflow.python.util import tf_inspect +from tensorflow.python.util.tf_export import tf_export + +from deepray.utils import logging_util +from . import variables as ev_variables +from .python import kv_variable_ops + +logger = logging_util.get_logger() + + +@tf_export(v1=["VariableScope"]) +class VariableScope(object): + """Variable scope object to carry defaults to provide to `get_variable`. + + Many of the arguments we need for `get_variable` in a variable store are most + easily handled with a context. This object is used for the defaults. + + Attributes: + name: name of the current scope, used as prefix in get_variable. + initializer: default initializer passed to get_variable. + regularizer: default regularizer passed to get_variable. + reuse: Boolean, None, or tf.compat.v1.AUTO_REUSE, setting the reuse in + get_variable. When eager execution is enabled this argument is always + forced to be False. + caching_device: string, callable, or None: the caching device passed to + get_variable. + partitioner: callable or `None`: the partitioner passed to `get_variable`. + custom_getter: default custom getter passed to get_variable. + name_scope: The name passed to `tf.name_scope`. + dtype: default type passed to get_variable (defaults to DT_FLOAT). + use_resource: if False, create a normal Variable; if True create an + experimental ResourceVariable with well-defined semantics. Defaults to + False (will later change to True). When eager execution is enabled this + argument is always forced to be True. + constraint: An optional projection function to be applied to the variable + after being updated by an `Optimizer` (e.g. used to implement norm + constraints or value constraints for layer weights). The function must + take as input the unprojected Tensor representing the value of the + variable and return the Tensor for the projected value (which must have + the same shape). Constraints are not safe to use when doing asynchronous + distributed training. 
+ """ + + def __init__( + self, + reuse, + name="", + initializer=None, + regularizer=None, + caching_device=None, + partitioner=None, + custom_getter=None, + name_scope="", + dtype=dtypes.float32, + use_resource=None, + constraint=None + ): + """Creates a new VariableScope with the given properties.""" + self._name = name + self._initializer = initializer + self._regularizer = regularizer + self._reuse = reuse + self._caching_device = caching_device + self._partitioner = partitioner + self._custom_getter = custom_getter + self._name_scope = name_scope + self._dtype = dtype + self._use_resource = use_resource + self._constraint = constraint + if context.executing_eagerly(): + if self._caching_device is not None: + raise NotImplementedError("Caching devices is not yet supported " + "when eager execution is enabled.") + self._reuse = AUTO_REUSE + self._use_resource = True + + @property + def name(self): + return self._name + + @property + def original_name_scope(self): + return self._name_scope + + @property + def reuse(self): + return self._reuse + + @property + def initializer(self): + return self._initializer + + @property + def dtype(self): + return self._dtype + + @property + def use_resource(self): + return self._use_resource + + @property + def regularizer(self): + return self._regularizer + + @property + def caching_device(self): + return self._caching_device + + @property + def partitioner(self): + return self._partitioner + + @property + def custom_getter(self): + return self._custom_getter + + @property + def constraint(self): + return self._constraint + + def reuse_variables(self): + """Reuse variables in this scope.""" + self._reuse = True + + def set_initializer(self, initializer): + """Set initializer for this scope.""" + self._initializer = initializer + + def set_dtype(self, dtype): + """Set data type for this scope.""" + self._dtype = dtype + + def set_use_resource(self, use_resource): + """Sets whether to use ResourceVariables for this scope.""" + if context.executing_eagerly() and not use_resource: + raise ValueError("When eager execution is enabled, " + "use_resource cannot be set to false.") + self._use_resource = use_resource + + def set_regularizer(self, regularizer): + """Set regularizer for this scope.""" + self._regularizer = regularizer + + def set_caching_device(self, caching_device): + """Set caching_device for this scope.""" + if context.executing_eagerly(): + raise NotImplementedError("Caching devices are not yet supported " + "when eager execution is enabled.") + self._caching_device = caching_device + + def set_partitioner(self, partitioner): + """Set partitioner for this scope.""" + self._partitioner = partitioner + + def set_custom_getter(self, custom_getter): + """Set custom getter for this scope.""" + self._custom_getter = custom_getter + + def get_collection(self, name): + """Get this scope's variables.""" + scope = self._name + "/" if self._name else "" + return ops.get_collection(name, scope) + + def trainable_variables(self): + """Get this scope's trainable variables.""" + return self.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) + + def global_variables(self): + """Get this scope's global variables.""" + return self.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + + def local_variables(self): + """Get this scope's local variables.""" + return self.get_collection(ops.GraphKeys.LOCAL_VARIABLES) + + def get_variable( + self, + var_store, + name, + shape=None, + dtype=None, + initializer=None, + regularizer=None, + reuse=None, + trainable=None, + 
collections=None, + caching_device=None, + partitioner=None, + validate_shape=True, + use_resource=None, + custom_getter=None, + constraint=None, + ): + """Gets an existing variable with this name or create a new one.""" + if regularizer is None: + regularizer = self._regularizer + if caching_device is None: + caching_device = self._caching_device + if partitioner is None: + partitioner = self._partitioner + if custom_getter is None: + custom_getter = self._custom_getter + if context.executing_eagerly(): + reuse = False + use_resource = True + else: + if reuse is None: + reuse = self._reuse + if use_resource is None: + use_resource = self._use_resource + + full_name = self.name + "/" + name if self.name else name + # Variable names only depend on variable_scope (full_name here), + # not name_scope, so we reset it below for the time of variable creation. + with ops.name_scope(None): + # Check that `initializer` dtype and `dtype` are consistent before + # replacing them with defaults. + if dtype is not None and initializer is not None and not callable(initializer): + init_dtype = ops.convert_to_tensor(initializer).dtype.base_dtype + if init_dtype != dtype: + raise ValueError("Initializer type '%s' and explicit dtype '%s' " + "don't match." % (init_dtype, dtype)) + if initializer is None: + initializer = self._initializer + if constraint is None: + constraint = self._constraint + if dtype is None: + dtype = self._dtype + return var_store.get_variable( + full_name, + shape=shape, + dtype=dtype, + initializer=initializer, + regularizer=regularizer, + reuse=reuse, + trainable=trainable, + collections=collections, + caching_device=caching_device, + partitioner=partitioner, + validate_shape=validate_shape, + use_resource=use_resource, + custom_getter=custom_getter, + constraint=constraint, + ) + + def get_embedding_variable( + self, + name, + shape=None, + dtype=None, + initializer=None, + regularizer=None, + reuse=None, + trainable=True, + collections=None, + caching_device=None, + partitioner=None, + validate_shape=True, + use_resource=None, + custom_getter=None, + constraint=None, + invalid_key=None, + evconfig=ev_variables.EmbeddingVariableConfig(), + ht_partition_num=1000 + ): + """Gets an existing variable with this name or create a new one.""" + if regularizer is None: + regularizer = self._regularizer + if caching_device is None: + caching_device = self._caching_device + if partitioner is None: + partitioner = self._partitioner + if custom_getter is None: + custom_getter = self._custom_getter + if not context.executing_eagerly(): + if reuse is None: + reuse = self._reuse + if use_resource is None: + use_resource = self._use_resource + else: + reuse = AUTO_REUSE + use_resource = True + + full_name = self.name + "/" + name if self.name else name + # Variable names only depend on variable_scope (full_name here), + # not name_scope, so we reset it below for the time of variable creation. + with ops.name_scope(None): + # Check that `initializer` dtype and `dtype` are consistent before + # replacing them with defaults. + if dtype is not None and initializer is not None and not callable(initializer): + init_dtype = ops.convert_to_tensor(initializer).dtype.base_dtype + if init_dtype != dtype: + raise ValueError("Initializer type '%s' and explicit dtype '%s' " + "don't match." 
% (init_dtype, dtype)) + if initializer is None: + initializer = self._initializer + if constraint is None: + constraint = self._constraint + if dtype is None: + dtype = self._dtype + if invalid_key is None: + invalid_key = -1 + return _VariableStore().get_variable( + full_name, + shape=shape, + dtype=dtype, + initializer=initializer, + regularizer=regularizer, + reuse=reuse, + trainable=trainable, + collections=collections, + caching_device=caching_device, + partitioner=partitioner, + validate_shape=validate_shape, + use_resource=use_resource, + custom_getter=custom_getter, + constraint=constraint, + invalid_key=invalid_key, + evconfig=evconfig, + ht_partition_num=ht_partition_num + ) + + def get_dynamic_dimension_embedding_variable( + self, + var_store, + name, + shape=None, + embedding_block_num=None, + dtype=None, + initializer=None, + regularizer=None, + reuse=None, + trainable=True, + collections=None, + caching_device=None, + partitioner=None, + validate_shape=True, + use_resource=None, + custom_getter=None, + constraint=None, + invalid_key=None, + evconfig=ev_variables.EmbeddingVariableConfig(), + ht_partition_num=1000 + ): + """Gets an existing variable with this name or create a new one.""" + if regularizer is None: + regularizer = self._regularizer + if caching_device is None: + caching_device = self._caching_device + if partitioner is None: + partitioner = self._partitioner + if custom_getter is None: + custom_getter = self._custom_getter + if not context.executing_eagerly(): + if reuse is None: + reuse = self._reuse + if use_resource is None: + use_resource = self._use_resource + else: + reuse = AUTO_REUSE + use_resource = True + + full_name = self.name + "/" + name if self.name else name + # Variable names only depend on variable_scope (full_name here), + # not name_scope, so we reset it below for the time of variable creation. + with ops.name_scope(None): + # Check that `initializer` dtype and `dtype` are consistent before + # replacing them with defaults. + if dtype is not None and initializer is not None and not callable(initializer): + init_dtype = ops.convert_to_tensor(initializer).dtype.base_dtype + if init_dtype != dtype: + raise ValueError("Initializer type '%s' and explicit dtype '%s' " + "don't match." 
% (init_dtype, dtype)) + if initializer is None: + initializer = self._initializer + if constraint is None: + constraint = self._constraint + if dtype is None: + dtype = self._dtype + if invalid_key is None: + invalid_key = -1 + return var_store.get_variable( + full_name, + shape=shape, + embedding_block_num=embedding_block_num, + dtype=dtype, + initializer=initializer, + regularizer=regularizer, + reuse=reuse, + trainable=trainable, + collections=collections, + caching_device=caching_device, + partitioner=partitioner, + validate_shape=validate_shape, + use_resource=use_resource, + custom_getter=custom_getter, + constraint=constraint, + invalid_key=invalid_key, + evconfig=evconfig, + ht_partition_num=ht_partition_num + ) + + def _get_partitioned_variable( + self, + var_store, + name, + shape=None, + dtype=None, + initializer=None, + regularizer=None, + trainable=None, + collections=None, + caching_device=None, + partitioner=None, + validate_shape=True, + use_resource=None, + constraint=None, + ): + """Gets an existing variable with this name or create a new one.""" + if initializer is None: + initializer = self._initializer + if regularizer is None: + regularizer = self._regularizer + if constraint is None: + constraint = self._constraint + if caching_device is None: + caching_device = self._caching_device + if partitioner is None: + partitioner = self._partitioner + if dtype is None: + dtype = self._dtype + if use_resource is None: + use_resource = self._use_resource + + if self._custom_getter is not None: + raise ValueError( + "Private access to _get_partitioned_variable is not allowed when " + "a custom getter is set. Current custom getter: %s. " + "It is likely that you're using create_partitioned_variables. " + "If so, consider instead using get_variable with a non-empty " + "partitioner parameter instead." % self._custom_getter + ) + + if partitioner is None: + raise ValueError("No partitioner was specified") + + # This allows the variable scope name to be used as the variable name if + # this function is invoked with an empty name arg, for backward + # compatibility with create_partitioned_variables(). + full_name_list = [] + if self.name: + full_name_list.append(self.name) + if name: + full_name_list.append(name) + full_name = "/".join(full_name_list) + + # Variable names only depend on variable_scope (full_name here), + # not name_scope, so we reset it below for the time of variable creation. + with ops.name_scope(None): + # pylint: disable=protected-access + return var_store._get_partitioned_variable( + full_name, + shape=shape, + dtype=dtype, + initializer=initializer, + regularizer=regularizer, + reuse=self.reuse, + trainable=trainable, + collections=collections, + caching_device=caching_device, + partitioner=partitioner, + validate_shape=validate_shape, + use_resource=use_resource, + constraint=constraint, + ) + # pylint: enable=protected-access + + +class _VariableStore(object): + """Variable store that carries a number of named Variables. + + New variable names and new variables can be created; all stored + variables are initialized with the initializer passed to __init__. + + Attributes: + vars: a dictionary with string names (same as passed in GetVar) as keys and + the corresponding TensorFlow Variables as values. + """ + + def __init__(self): + """Create a variable store.""" + self._vars = {} # A dictionary of the stored TensorFlow variables. + self._partitioned_vars = {} # A dict of the stored PartitionedVariables. 
+ self._store_eager_variables = False + + def get_variable( + self, + name, + shape=None, + embedding_block_num=None, + dtype=dtypes.float32, + initializer=None, + regularizer=None, + reuse=None, + trainable=None, + collections=None, + caching_device=None, + partitioner=None, + validate_shape=True, + use_resource=None, + custom_getter=None, + constraint=None, + invalid_key=None, + evconfig=ev_variables.EmbeddingVariableConfig(), + ht_partition_num=1000 + ): + """Gets an existing variable with these parameters or create a new one. + + If a variable with the given name is already stored, we return the stored + variable. Otherwise, we create a new one. + + Set `reuse` to `True` when you only want to reuse existing Variables. + Set `reuse` to `False` when you only want to create new Variables. + Set `reuse` to None (the default) or tf.compat.v1.AUTO_REUSE when you want + variables to be created if they don't exist or returned if they do. + + If initializer is `None` (the default), the default initializer passed in + the constructor is used. If that one is `None` too, we use a new + `glorot_uniform_initializer`. If initializer is a Tensor, we use + it as a value and derive the shape from the initializer. + + If a partitioner is provided, a `PartitionedVariable` is returned. + Accessing this object as a `Tensor` returns the shards concatenated along + the partition axis. + + Some useful partitioners are available. See, e.g., + `variable_axis_size_partitioner` and `min_max_variable_partitioner`. + + Args: + name: The name of the new or existing variable. + shape: Shape of the new or existing variable. + dtype: Type of the new or existing variable (defaults to `DT_FLOAT`). + initializer: Initializer for the variable. + regularizer: A (Tensor -> Tensor or None) function; the result of applying + it on a newly created variable will be added to the collection + GraphKeys.REGULARIZATION_LOSSES and can be used for regularization. + reuse: a Boolean, None, or tf.AUTO_REUSE. Controls reuse or creation of + variables. When eager execution is enabled this argument is always + forced to be False. + trainable: If `True` also add the variable to the graph collection + `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). `trainable` + defaults to `True`, unless `synchronization` is set to `ON_READ`, in + which case it defaults to `False`. + collections: List of graph collections keys to add the `Variable` to. + Defaults to `[GraphKeys.GLOBAL_VARIABLES]` (see `tf.Variable`). + caching_device: Optional device string or function describing where the + Variable should be cached for reading. Defaults to the Variable's + device. If not `None`, caches on another device. Typical use is to + cache on the device where the Ops using the `Variable` reside, to + deduplicate copying through `Switch` and other conditional statements. + partitioner: Optional callable that accepts a fully defined `TensorShape` + and dtype of the `Variable` to be created, and returns a list of + partitions for each axis (currently only one axis can be partitioned). + validate_shape: If False, allows the variable to be initialized with a + value of unknown shape. If True, the default, the shape of initial_value + must be known. + use_resource: If False, creates a regular Variable. If True, creates + instead an experimental ResourceVariable which has well-defined + semantics. Defaults to False (will later change to True). When eager + execution is enabled this argument is always forced to be true. 
+ custom_getter: Callable that takes as a first argument the true getter, + and allows overwriting the internal get_variable method. The signature + of `custom_getter` should match that of this method, + but the most future-proof version will allow for changes: `def + custom_getter(getter, *args, **kwargs)`. Direct access to + all `get_variable` parameters is also allowed: `def + custom_getter(getter, name, *args, **kwargs)`. A simple identity + custom getter that simply creates variables with modified names is: + ```python + def custom_getter(getter, name, *args, **kwargs): return getter(name + + '_suffix', *args, **kwargs) ``` + constraint: An optional projection function to be applied to the variable + after being updated by an `Optimizer` (e.g. used to implement norm + constraints or value constraints for layer weights). The function must + take as input the unprojected Tensor representing the value of the + variable and return the Tensor for the projected value (which must have + the same shape). Constraints are not safe to use when doing asynchronous + distributed training. + + Returns: + The created or existing `Variable` (or `PartitionedVariable`, if a + partitioner was used). + + Raises: + ValueError: when creating a new variable and shape is not declared, + when reusing a variable and specifying a conflicting shape, + or when violating reuse during variable creation. + RuntimeError: when eager execution is enabled and not called from an + EagerVariableStore. + """ + if custom_getter is not None and not callable(custom_getter): + raise ValueError("Passed a custom_getter which is not callable: %s" % custom_getter) + + # If a *_ref type is passed in an error would be triggered further down the + # stack. We prevent this using base_dtype to get a non-ref version of the + # type, before doing anything else. When _ref types are removed in favor of + # resources, this line can be removed. + try: + dtype = dtype.base_dtype + except AttributeError: + # .base_dtype not existing means that we will try and use the raw dtype + # which was passed in - this might be a NumPy type which is valid. + pass + + # This is the main logic of get_variable. However, custom_getter + # may override this logic. So we save it as a callable and pass + # it to custom_getter. + # Note: the parameters of _true_getter, and their documentation, match + # *exactly* item-for-item with the docstring of this method. 
+ def _true_getter( # pylint: disable=missing-docstring + name, + shape=None, + embedding_block_num=None, + dtype=dtypes.float32, + initializer=None, + regularizer=None, + reuse=None, + trainable=None, + collections=None, + caching_device=None, + partitioner=None, + validate_shape=True, + use_resource=None, + constraint=None, + invalid_key=None, + evconfig=ev_variables.EmbeddingVariableConfig(), + ht_partition_num=1000): + is_scalar = (shape is not None and isinstance(shape, collections_lib.abc.Sequence) and not shape) + # Partitioned variable case + if partitioner is not None and not is_scalar: + if not callable(partitioner): + raise ValueError("Partitioner must be callable, but received: %s" % partitioner) + with ops.name_scope(None): + return self._get_partitioned_variable( + name=name, + shape=shape, + embedding_block_num=embedding_block_num, + dtype=dtype, + initializer=initializer, + regularizer=regularizer, + reuse=reuse, + trainable=trainable, + collections=collections, + caching_device=caching_device, + partitioner=partitioner, + validate_shape=validate_shape, + use_resource=use_resource, + constraint=constraint, + invalid_key=invalid_key, + evconfig=evconfig, + ht_partition_num=ht_partition_num + ) + + # Special case for partitioned variable to allow reuse without having to + # specify partitioner. + if reuse is True and partitioner is None and name in self._partitioned_vars: + return self._get_partitioned_variable( + name=name, + shape=shape, + embedding_block_num=embedding_block_num, + dtype=dtype, + initializer=initializer, + regularizer=regularizer, + reuse=reuse, + trainable=trainable, + collections=collections, + caching_device=caching_device, + partitioner=None, + validate_shape=validate_shape, + use_resource=use_resource, + constraint=constraint, + invalid_key=invalid_key, + evconfig=evconfig, + ht_partition_num=ht_partition_num + ) + + # Single variable case + if "%s/part_0" % name in self._vars: + raise ValueError( + "No partitioner was provided, but a partitioned version of the " + "variable was found: %s/part_0. Perhaps a variable of the same " + "name was already created with partitioning?" % name + ) + + return self._get_single_variable( + name=name, + shape=shape, + embedding_block_num=embedding_block_num, + dtype=dtype, + initializer=initializer, + regularizer=regularizer, + reuse=reuse, + trainable=trainable, + collections=collections, + caching_device=caching_device, + validate_shape=validate_shape, + use_resource=use_resource, + constraint=constraint, + invalid_key=invalid_key, + evconfig=evconfig, + ht_partition_num=ht_partition_num + ) + + if custom_getter is not None: + # Handle backwards compatibility with getter arguments that were added + # to the API after users started writing custom getters. + custom_getter_kwargs = { + "getter": _true_getter, + "name": name, + "shape": shape, + "embedding_block_num": embedding_block_num, + "dtype": dtype, + "initializer": initializer, + "regularizer": regularizer, + "reuse": reuse, + "trainable": trainable, + "collections": collections, + "caching_device": caching_device, + "partitioner": partitioner, + "validate_shape": validate_shape, + "use_resource": use_resource, + "invalid_key": invalid_key, + "evconfig": evconfig, + "ht_partition_num": ht_partition_num, + } + # `fn_args` and `has_kwargs` can handle functions, `functools.partial`, + # `lambda`. 
+ if "constraint" in function_utils.fn_args(custom_getter) or function_utils.has_kwargs(custom_getter): + custom_getter_kwargs["constraint"] = constraint + return custom_getter(**custom_getter_kwargs) + else: + return _true_getter( + name, + shape=shape, + embedding_block_num=embedding_block_num, + dtype=dtype, + initializer=initializer, + regularizer=regularizer, + reuse=reuse, + trainable=trainable, + collections=collections, + caching_device=caching_device, + partitioner=partitioner, + validate_shape=validate_shape, + use_resource=use_resource, + constraint=constraint, + invalid_key=invalid_key, + evconfig=evconfig, + ht_partition_num=ht_partition_num + ) + + def _get_single_variable( + self, + name, + shape=None, + embedding_block_num=None, + dtype=dtypes.float32, + initializer=None, + regularizer=None, + partition_info=None, + reuse=None, + trainable=None, + collections=None, + caching_device=None, + validate_shape=True, + use_resource=None, + constraint=None, + invalid_key=None, + evconfig=ev_variables.EmbeddingVariableConfig(), + ht_partition_num=1000 + ): + """Get or create a single Variable (e.g. + + a shard or entire variable). + + See the documentation of get_variable above (ignore partitioning components) + for details. + + Args: + name: see get_variable. + shape: see get_variable. + dtype: see get_variable. + initializer: see get_variable. + regularizer: see get_variable. + partition_info: _PartitionInfo object. + reuse: see get_variable. + trainable: see get_variable. + collections: see get_variable. + caching_device: see get_variable. + validate_shape: see get_variable. + use_resource: see get_variable. + constraint: see get_variable. + + Returns: + A Variable. See documentation of get_variable above. + + Raises: + ValueError: See documentation of get_variable above. + """ + # Set to true if initializer is a constant. + initializing_from_value = False + if initializer is not None and not callable(initializer): + initializing_from_value = True + if shape is not None and initializing_from_value: + raise ValueError("If initializer is a constant, do not specify shape.") + + dtype = dtypes.as_dtype(dtype) + shape = tensor_shape.as_shape(shape) + + if name in self._vars: + # Here we handle the case when returning an existing variable. + if reuse is False: + var = self._vars[name] + err_msg = ( + "Variable %s already exists, disallowed." + " Did you mean to set reuse=True or " + "reuse=tf.AUTO_REUSE in VarScope?" % name + ) + # ResourceVariables don't have an op associated with so no traceback + if isinstance(var, resource_variable_ops.ResourceVariable): + raise ValueError(err_msg) + tb = var.op.traceback[::-1] + # Throw away internal tf entries and only take a few lines. In some + # cases the traceback can be longer (e.g. if someone uses factory + # functions to create variables) so we take more than needed in the + # default case. + tb = [x for x in tb if "tensorflow/python" not in x[0]][:5] + raise ValueError("%s Originally defined at:\n\n%s" % (err_msg, "".join(traceback.format_list(tb)))) + found_var = self._vars[name] + from tensorflow.python.ops.hash_table import hash_table + if isinstance(found_var, (hash_table.HashTable, hash_table.DistributedHashTable)): + raise ValueError( + "Trying to reuse variable %s, but an existing variable is a" + " HashTable or DistributedHashTable, can not reuse it." % (name) + ) + if not shape.is_compatible_with(found_var.get_shape()): + raise ValueError( + "Trying to share variable %s, but specified shape %s" + " and found shape %s." 
% (name, shape, found_var.get_shape()) + ) + if not dtype.is_compatible_with(found_var.dtype): + dtype_str = dtype.name + found_type_str = found_var.dtype.name + raise ValueError( + "Trying to share variable %s, but specified dtype %s" + " and found dtype %s." % (name, dtype_str, found_type_str) + ) + return found_var + + # Create the tensor to initialize the variable with default value. + if initializer is None: + initializer, initializing_from_value = self._get_default_initializer(name=name, shape=shape, dtype=dtype) + # Enter an init scope when creating the initializer. + with ops.init_scope(): + if initializing_from_value: + init_val = initializer + variable_dtype = None + else: + # Instantiate initializer if provided initializer is a type object. + if tf_inspect.isclass(initializer): + initializer = initializer() + if shape is not None and shape.is_fully_defined(): + if use_resource and invalid_key is not None: + s = [1 if isinstance(initializer, init_ops.Constant) else evconfig.default_value_dim] + shape.as_list() + evconfig.default_value_dim = 1 if isinstance(initializer, init_ops.Constant) else evconfig.default_value_dim + else: + s = shape.as_list() + init_val = functools.partial(initializer, shape=s, dtype=dtype) + variable_dtype = dtype.base_dtype + elif len(tf_inspect.getargspec(initializer).args) == len(tf_inspect.getargspec(initializer).defaults or []): + init_val = initializer + variable_dtype = None + else: + raise ValueError( + "The initializer passed is not valid. It should " + "be a callable with no arguments and the " + "shape should not be provided or an instance of " + "`tf.keras.initializers.*' and `shape` should be " + "fully defined." + ) + + v = default_variable_creator( + initial_value=init_val, + name=name, + trainable=trainable, + collections=collections, + caching_device=caching_device, + embedding_block_num=embedding_block_num, + dtype=variable_dtype, + validate_shape=validate_shape, + constraint=constraint, + invalid_key=invalid_key, + evconfig=evconfig, + initializer=initializer, + ht_partition_num=ht_partition_num + ) + if not context.executing_eagerly() or self._store_eager_variables: + # In eager mode we do not want to keep default references to Variable + # objects as this will prevent their memory from being released. + self._vars[name] = v + logging.vlog(1, "Created variable %s with shape %s and init %s", v.name, format(shape), initializer) + + # Run the regularizer if requested and save the resulting loss. + if regularizer: + with ops.colocate_with(v): + with ops.name_scope(name + "/Regularizer/"): + with ops.init_scope(): + loss = regularizer(v) + if loss is not None: + if context.executing_eagerly(): + v_name = "v_%s" % type(v) + loss_name = "loss_%s" % type(loss) + else: + v_name = v.name + loss_name = loss.name + logging.vlog( + 1, "Applied regularizer to %s and added the result %s " + "to REGULARIZATION_LOSSES.", v_name, loss_name + ) + ops.add_to_collection(ops.GraphKeys.REGULARIZATION_LOSSES, loss) + return v + + # Initialize variable when no initializer provided + def _get_default_initializer(self, name, shape=None, dtype=dtypes.float32): + """Provide a default initializer and a corresponding value. + + Args: + name: see get_variable. + shape: see get_variable. + dtype: see get_variable. + + Returns: + initializer and initializing_from_value. See get_variable above. + + Raises: + ValueError: When giving unsupported dtype. 
+ """ + del shape + # If dtype is DT_FLOAT, provide a uniform unit scaling initializer + if dtype.is_floating: + initializer = init_ops.glorot_uniform_initializer() + initializing_from_value = False + # If dtype is DT_INT/DT_UINT, provide a default value `zero` + # If dtype is DT_BOOL, provide a default value `FALSE` + elif dtype.is_integer or dtype.is_unsigned or dtype.is_bool or dtype == dtypes.string: + initializer = init_ops.zeros_initializer() + initializing_from_value = False + # NOTES:Do we need to support for handling DT_STRING and DT_COMPLEX here? + else: + raise ValueError("An initializer for variable %s of %s is required" % (name, dtype.base_dtype)) + + return initializer, initializing_from_value + + +# @tf_export(v1=["get_embedding_variable"]) +def get_embedding_variable_internal( + name, + embedding_dim, + key_dtype=dtypes.int64, + value_dtype=None, + initializer=None, + regularizer=None, + trainable=True, + collections=None, + caching_device=None, + partitioner=None, + validate_shape=True, + custom_getter=None, + constraint=None, + steps_to_live=None, + init_data_source=None, + ev_option=ev_variables.EmbeddingVariableOption() +): + if key_dtype == dtypes.int64: + invalid_key = 9223372036854775807 + elif key_dtype == dtypes.int32: + invalid_key = -1 + elif key_dtype == dtypes.string: + invalid_key = "" + else: + raise ValueError("Not support key_dtype: %s, only support int64/int32/string" % key_dtype) + l2_weight_threshold = -1.0 + if initializer is None and ev_option.init.initializer is None: + initializer = init_ops.truncated_normal_initializer() + elif ev_option.init.initializer is not None: + if initializer is not None: + logger.warning("Use initializer in InitializerOption.") + initializer = ev_option.init.initializer + if ev_option.evict is not None: + if isinstance(ev_option.evict, ev_variables.GlobalStepEvict): + if steps_to_live is not None: + logger.warning("Warning: steps_to_live is double set, the steps_to_live in EvcitConfig is valid") + steps_to_live = ev_option.evict.steps_to_live + elif isinstance(ev_option.evict, ev_variables.L2WeightEvict): + l2_weight_threshold = ev_option.evict.l2_weight_threshold + else: + l2_weight_threshold = -1.0 + if steps_to_live is not None and l2_weight_threshold > 0: + raise ValueError("step_to_live and l2_weight_threshold can't be enabled at same time.") + return VariableScope(reuse=False).get_embedding_variable( + name, + shape=embedding_dim, + dtype=value_dtype, + initializer=initializer, + regularizer=regularizer, + trainable=trainable, + collections=collections, + caching_device=caching_device, + partitioner=partitioner, + validate_shape=validate_shape, + use_resource=True, + custom_getter=custom_getter, + constraint=constraint, + invalid_key=invalid_key, + evconfig=ev_variables.EmbeddingVariableConfig( + steps_to_live=steps_to_live, + init_data_source=init_data_source, + ht_type=ev_option.ht_type, + l2_weight_threshold=l2_weight_threshold, + filter_strategy=ev_option.filter_strategy, + storage_type=ev_option.storage_option.storage_type, + storage_path=ev_option.storage_option.storage_path, + storage_size=ev_option.storage_option.storage_size, + storage_cache_strategy=ev_option.storage_option.cache_strategy, + layout=ev_option.storage_option.layout, + default_value_dim=ev_option.init.default_value_dim, + default_value_no_permission=ev_option.init.default_value_no_permission + ), + ht_partition_num=ev_option.ht_partition_num + ) + + +# @tf_export(v1=["get_embedding_variable_v2"]) +def get_embedding_variable_v2_internal( + name, 
+    embedding_dim,
+    key_dtype=dtypes.int64,
+    value_dtype=None,
+    initializer=None,
+    regularizer=None,
+    trainable=True,
+    collections=None,
+    caching_device=None,
+    partitioner=None,
+    validate_shape=True,
+    custom_getter=None,
+    constraint=None,
+    evconfig=ev_variables.EmbeddingVariableConfig(),
+    ht_partition_num=1000
+):
+  if key_dtype == dtypes.int64:
+    invalid_key = 9223372036854775807
+  elif key_dtype == dtypes.int32:
+    invalid_key = -1
+  elif key_dtype == dtypes.string:
+    invalid_key = ""
+  else:
+    raise ValueError("Unsupported key_dtype: %s, only int64/int32/string are supported" % key_dtype)
+  if initializer is None:
+    initializer = init_ops.truncated_normal_initializer()
+  return VariableScope(reuse=False).get_embedding_variable(
+      name,
+      shape=embedding_dim,
+      dtype=value_dtype,
+      initializer=initializer,
+      regularizer=regularizer,
+      trainable=trainable,
+      collections=collections,
+      caching_device=caching_device,
+      partitioner=partitioner,
+      validate_shape=validate_shape,
+      use_resource=True,
+      custom_getter=custom_getter,
+      constraint=constraint,
+      invalid_key=invalid_key,
+      evconfig=evconfig,
+      ht_partition_num=ht_partition_num
+  )
+
+
+@tf_export(v1=["get_embedding_variable"])
+def get_embedding_variable(
+    name,
+    embedding_dim,
+    key_dtype=dtypes.int64,
+    value_dtype=None,
+    initializer=None,
+    regularizer=None,
+    trainable=True,
+    collections=None,
+    caching_device=None,
+    partitioner=None,
+    validate_shape=True,
+    custom_getter=None,
+    constraint=None,
+    steps_to_live=None,
+    init_data_source=None,
+    ev_option=ev_variables.EmbeddingVariableOption()
+):
+  if key_dtype == dtypes.int64:
+    invalid_key = 9223372036854775807
+  elif key_dtype == dtypes.int32:
+    invalid_key = -1
+  elif key_dtype == dtypes.string:
+    invalid_key = ""
+  else:
+    raise ValueError("Unsupported key_dtype: %s, only int64/int32/string are supported" % key_dtype)
+  l2_weight_threshold = -1.0
+  if initializer is None and ev_option.init.initializer is None:
+    initializer = init_ops.truncated_normal_initializer()
+  elif ev_option.init.initializer is not None:
+    if initializer is not None:
+      logger.warning("Using the initializer given in InitializerOption.")
+    initializer = ev_option.init.initializer
+  if steps_to_live is not None:
+    logger.warning("steps_to_live is deprecated; use tf.GlobalStepEvict(steps_to_live) instead.")
+  if ev_option.evict is not None:
+    if isinstance(ev_option.evict, ev_variables.GlobalStepEvict):
+      if steps_to_live is not None:
+        logger.warning("steps_to_live is set twice; the value from GlobalStepEvict takes effect.")
+      steps_to_live = ev_option.evict.steps_to_live
+    elif isinstance(ev_option.evict, ev_variables.L2WeightEvict):
+      l2_weight_threshold = ev_option.evict.l2_weight_threshold
+  else:
+    l2_weight_threshold = -1.0
+  if steps_to_live is not None and l2_weight_threshold > 0:
+    raise ValueError("steps_to_live and l2_weight_threshold can't be enabled at the same time.")
+  return VariableScope(reuse=False).get_embedding_variable(
+      name,
+      shape=embedding_dim,
+      dtype=value_dtype,
+      initializer=initializer,
+      regularizer=regularizer,
+      trainable=trainable,
+      collections=collections,
+      caching_device=caching_device,
+      partitioner=partitioner,
+      validate_shape=validate_shape,
+      use_resource=True,
+      custom_getter=custom_getter,
+      constraint=constraint,
+      invalid_key=invalid_key,
+      evconfig=ev_variables.EmbeddingVariableConfig(
+          steps_to_live=steps_to_live,
+          init_data_source=init_data_source,
+          ht_type=ev_option.ht_type,
+          l2_weight_threshold=l2_weight_threshold,
+
filter_strategy=ev_option.filter_strategy, + storage_type=ev_option.storage_option.storage_type, + storage_path=ev_option.storage_option.storage_path, + storage_size=ev_option.storage_option.storage_size, + storage_cache_strategy=ev_option.storage_option.cache_strategy, + layout=ev_option.storage_option.layout, + default_value_dim=ev_option.init.default_value_dim, + default_value_no_permission=ev_option.init.default_value_no_permission + ), + ht_partition_num=ev_option.ht_partition_num + ) + + +def default_variable_creator( + initial_value=None, + trainable=None, + collections=None, + validate_shape=True, + caching_device=None, + name=None, + variable_def=None, + dtype=None, + embedding_block_num=None, + import_scope=None, + constraint=None, + invalid_key=None, + evconfig=ev_variables.EmbeddingVariableConfig(), + initializer=None, + ht_partition_num=1000 +): + if invalid_key is not None: + emb_blocknum = embedding_block_num + if emb_blocknum is None: + ev = kv_variable_ops.EmbeddingVariable( + initial_value=initial_value, + trainable=trainable, + collections=collections, + validate_shape=validate_shape, + caching_device=caching_device, + name=name, + dtype=dtype, + constraint=constraint, + variable_def=variable_def, + import_scope=import_scope, + invalid_key=invalid_key, + evconfig=evconfig, + # initializer=initializer, + ht_partition_num=ht_partition_num + ) + if evconfig.init_data_source is not None: + ev.set_init_data_source_initializer(evconfig.init_data_source) + return ev + else: + evconfig.block_num = emb_blocknum + evlist = [] + block_evconfig = copy.copy(evconfig) + block_evconfig.handle_name = name + block_evconfig.emb_index = 0 + primary_ev = kv_variable_ops.EmbeddingVariable( + initial_value=initial_value, + trainable=trainable, + collections=collections, + validate_shape=validate_shape, + caching_device=caching_device, + name=name + "/block0", + dtype=dtype, + constraint=constraint, + variable_def=variable_def, + import_scope=import_scope, + invalid_key=invalid_key, + evconfig=block_evconfig, + initializer=initializer, + ht_partition_num=ht_partition_num + ) + if evconfig.init_data_source is not None: + primary_ev.set_init_data_source_initializer(evconfig.init_data_source) + evlist.append(primary_ev) + block_evconfig.primary = primary_ev + with ops.colocate_with(primary_ev): + block_evconfig.handle_name = primary_ev._block_handle_name + for i in range(emb_blocknum - 1): + slave_evconfig = copy.copy(block_evconfig) + slave_evconfig.emb_index = i + 1 + slave_evconfig._slot_num = primary_ev._slot_num + slave_ev = kv_variable_ops.EmbeddingVariable( + initial_value=initial_value, + trainable=trainable, + collections=collections, + validate_shape=validate_shape, + caching_device=caching_device, + name=name + "/block" + str(i + 1), + dtype=dtype, + constraint=constraint, + variable_def=variable_def, + import_scope=import_scope, + invalid_key=invalid_key, + evconfig=slave_evconfig, + initializer=initializer, + ht_partition_num=ht_partition_num + ) + if evconfig.init_data_source is not None: + slave_ev._set_init_data_source_initializer(evconfig.init_data_source) + evlist.append(slave_ev) + dyn_ev = kv_variable_ops.DynamicEmbeddingVariable(name, evlist) + return dyn_ev diff --git a/deepray/custom_ops/embedding_variable/variables.py b/deepray/custom_ops/embedding_variable/variables.py new file mode 100644 index 00000000..31a1c6ad --- /dev/null +++ b/deepray/custom_ops/embedding_variable/variables.py @@ -0,0 +1,206 @@ +from tensorflow.python.framework import dtypes +from 
tensorflow.python.lib.io import file_io
+from tensorflow.python.util.tf_export import tf_export
+
+from deepray.custom_ops.embedding_variable import config_pb2
+from deepray.utils import logging_util
+
+logger = logging_util.get_logger()
+
+
+@tf_export(v1=["InitializerOption"])
+class InitializerOption(object):
+
+  def __init__(self, initializer=None, default_value_dim=4096, default_value_no_permission=.0):
+    self.initializer = initializer
+    self.default_value_dim = default_value_dim
+    self.default_value_no_permission = default_value_no_permission
+    if default_value_dim <= 0:
+      logger.warning("default_value_dim must be greater than 0; falling back to the default of 4096.")
+      self.default_value_dim = 4096
+
+
+@tf_export(v1=["GlobalStepEvict"])
+class GlobalStepEvict(object):
+
+  def __init__(self, steps_to_live=None):
+    self.steps_to_live = steps_to_live
+
+
+@tf_export(v1=["L2WeightEvict"])
+class L2WeightEvict(object):
+
+  def __init__(self, l2_weight_threshold=-1.0):
+    self.l2_weight_threshold = l2_weight_threshold
+    if l2_weight_threshold <= 0 and l2_weight_threshold != -1.0:
+      logger.warning("l2_weight_threshold is invalid; L2-weight-based eviction is disabled")
+
+
+@tf_export(v1=["CheckpointOption"])
+class CheckpointOption(object):
+
+  def __init__(
+      self,
+      ckpt_to_load_from=None,
+      tensor_name_in_ckpt=None,
+      always_load_from_specific_ckpt=False,
+      init_data_source=None
+  ):
+    self.ckpt_to_load_from = ckpt_to_load_from
+    self.tensor_name_in_ckpt = tensor_name_in_ckpt
+    self.always_load_from_specific_ckpt = always_load_from_specific_ckpt
+    self.init_data_source = init_data_source
+
+
+@tf_export(v1=["StorageOption"])
+class StorageOption(object):
+
+  def __init__(
+      self,
+      storage_type=None,
+      storage_path=None,
+      storage_size=[1024 * 1024 * 1024],
+      cache_strategy=config_pb2.CacheStrategy.LFU,
+      layout=None
+  ):
+    self.storage_type = storage_type
+    self.storage_path = storage_path
+    self.storage_size = storage_size
+    self.cache_strategy = cache_strategy
+    self.layout = layout
+    if not isinstance(storage_size, list):
+      raise ValueError("storage_size should be a list")
+    if len(storage_size) < 4:
+      for i in range(len(storage_size), 4):
+        storage_size.append(1024 * 1024 * 1024)
+    if storage_path is not None:
+      if storage_type is None:
+        raise ValueError("storage_type mustn't be None when storage_path is set")
+      else:
+        if not file_io.file_exists(storage_path):
+          file_io.recursive_create_dir(storage_path)
+    else:
+      if storage_type is not None and storage_type in [
+          config_pb2.StorageType.LEVELDB, config_pb2.StorageType.SSDHASH, config_pb2.StorageType.DRAM_SSDHASH,
+          config_pb2.StorageType.DRAM_LEVELDB
+      ]:
+        raise ValueError("storage_path mustn't be None when a disk-backed storage_type is set")
+
+
+@tf_export(v1=["EmbeddingVariableOption"])
+class EmbeddingVariableOption(object):
+
+  def __init__(
+      self,
+      ht_type="",
+      ht_partition_num=1000,
+      evict_option=None,
+      ckpt=None,
+      filter_option=None,
+      storage_option=StorageOption(),
+      init_option=InitializerOption()
+  ):
+    self.ht_type = ht_type
+    self.ht_partition_num = ht_partition_num
+    self.evict = evict_option
+    self.ckpt = ckpt
+    self.filter_strategy = filter_option
+    self.storage_option = storage_option
+    self.init = init_option
+
+
+@tf_export(v1=["CounterFilter"])
+class CounterFilter(object):
+
+  def __init__(self, filter_freq=0):
+    self.filter_freq = filter_freq
+
+
+@tf_export(v1=["CBFFilter"])
+class CBFFilter(object):
+
+  def __init__(self, filter_freq=0, max_element_size=0, false_positive_probability=-1.0, counter_type=dtypes.uint64):
+
if false_positive_probability != -1.0: + if false_positive_probability <= 0.0: + raise ValueError("false_positive_probablity must larger than 0") + else: + if max_element_size <= 0: + raise ValueError("max_element_size must larger than 0 when false_positive_probability is not -1.0") + else: + if max_element_size != 0: + raise ValueError("max_element_size can't be set when false_probability is -1.0") + self.max_element_size = max_element_size + self.false_positive_probability = false_positive_probability + self.counter_type = counter_type + self.filter_freq = filter_freq + + +class EmbeddingVariableConfig(object): + + def __init__( + self, + steps_to_live=None, + steps_to_live_l2reg=None, + l2reg_theta=None, + l2reg_lambda=None, + l2_weight_threshold=-1.0, + ht_type=None, + filter_strategy=None, + ckpt_to_load_from=None, + tensor_name_in_ckpt=None, + always_load_from_specific_ckpt=False, + init_data_source=None, + handle_name=None, + emb_index=None, + slot_index=None, + block_num=None, + primary=None, + slot_num=None, + storage_type=config_pb2.StorageType.DRAM, + storage_path=None, + storage_size=None, + storage_cache_strategy=config_pb2.CacheStrategy.LFU, + layout=None, + default_value_dim=4096, + default_value_no_permission=.0 + ): + self.steps_to_live = steps_to_live + self.steps_to_live_l2reg = steps_to_live_l2reg + self.l2reg_theta = l2reg_theta + self.l2reg_lambda = l2reg_lambda + self.ckpt_to_load_from = ckpt_to_load_from + self.tensor_name_in_ckpt = tensor_name_in_ckpt + self.always_load_from_specific_ckpt = always_load_from_specific_ckpt + self.init_data_source = init_data_source + self.handle_name = handle_name + self.emb_index = emb_index + self.slot_index = slot_index + self.block_num = block_num + self.primary = primary + self.slot_num = slot_num + self.ht_type = ht_type + self.l2_weight_threshold = l2_weight_threshold + self.filter_strategy = filter_strategy + self.storage_type = storage_type + self.storage_path = storage_path + self.storage_size = storage_size + self.storage_cache_strategy = storage_cache_strategy + self.layout = layout + self.default_value_dim = default_value_dim + self.default_value_no_permission = default_value_no_permission + + def reveal(self): + if self.steps_to_live is None: + self.steps_to_live = 0 + if self.steps_to_live_l2reg is None: + self.steps_to_live_l2reg = 0 + if self.l2reg_theta is None: + self.l2reg_theta = 0 + if self.l2reg_lambda is None: + self.l2reg_lambda = 0 + if self.ht_type is None: + self.ht_type = '' + if self.emb_index is None: + self.emb_index = 0 + if self.slot_index is None: + self.slot_index = 0 diff --git a/deepray/custom_ops/ffm_ops/BUILD b/deepray/custom_ops/ffm_ops/BUILD index 3e2bce28..b60b9fea 100644 --- a/deepray/custom_ops/ffm_ops/BUILD +++ b/deepray/custom_ops/ffm_ops/BUILD @@ -1,5 +1,4 @@ load("//deepray:deepray.bzl", "custom_op_library") -load("@local_config_tf//:build_defs.bzl", "CPLUSPLUS_VERSION") licenses(["notice"]) # Apache 2.0 @@ -10,11 +9,14 @@ custom_op_library( "cc/kernels/ffm_kernels.h", "cc/ops/ffm_ops.cc", ], - copts = [CPLUSPLUS_VERSION], - cuda_srcs = [ + copts = ["-Wno-unused-result"], + gpu_srcs = [ "cc/kernels/ffm_kernels.h", "cc/kernels/ffm_kernels.cu.cc", ], + deps = [ + "//deepray/custom_ops/utils:ok_status_util", + ], ) py_library( @@ -42,5 +44,9 @@ py_test( main = "python/tests/run_all_test.py", deps = [ ":ffm_ops", + "//deepray/layers", + "@pypi_numpy//:pkg", + "@pypi_pytest//:pkg", + "@pypi_tensorflow//:pkg", ], ) diff --git a/deepray/custom_ops/ffm_ops/cc/kernels/ffm_kernels.cu.cc 
b/deepray/custom_ops/ffm_ops/cc/kernels/ffm_kernels.cu.cc index 87384401..e3c05133 100644 --- a/deepray/custom_ops/ffm_ops/cc/kernels/ffm_kernels.cu.cc +++ b/deepray/custom_ops/ffm_ops/cc/kernels/ffm_kernels.cu.cc @@ -15,10 +15,11 @@ #if GOOGLE_CUDA #define EIGEN_USE_GPU +#include "ffm_kernels.h" + #include #include -#include "ffm_kernels.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" diff --git a/deepray/custom_ops/ffm_ops/cc/ops/ffm_ops.cc b/deepray/custom_ops/ffm_ops/cc/ops/ffm_ops.cc index 7aa66618..ec7f6ebb 100644 --- a/deepray/custom_ops/ffm_ops/cc/ops/ffm_ops.cc +++ b/deepray/custom_ops/ffm_ops/cc/ops/ffm_ops.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" @@ -43,7 +44,7 @@ REGISTER_OP("FFM") } ctx->set_output(0, ctx->Matrix(batch_size, out_dims)); - return Status::OK(); + return TFOkStatus; }); REGISTER_OP("FFMGrad") @@ -57,7 +58,7 @@ REGISTER_OP("FFMGrad") .SetShapeFn([](shape_inference::InferenceContext *ctx) { ctx->set_output(0, ctx->input(1)); ctx->set_output(1, ctx->input(2)); - return Status::OK(); + return TFOkStatus; }); } // namespace tensorflow diff --git a/deepray/custom_ops/ffm_ops/python/ffm_ops.py b/deepray/custom_ops/ffm_ops/python/ffm_ops.py index 4b993931..c40fd3e0 100644 --- a/deepray/custom_ops/ffm_ops/python/ffm_ops.py +++ b/deepray/custom_ops/ffm_ops/python/ffm_ops.py @@ -16,7 +16,7 @@ from deepray.utils.resource_loader import LazySO -gen_ffm_ops = LazySO("custom_ops/feature_cross/_ffm_ops.so") +gen_ffm_ops = LazySO("custom_ops/ffm_ops/_ffm_ops.so") def ffm(left: tf.Tensor, right: tf.Tensor, dim_size: int, int_type: str = 'multiply') -> tf.Tensor: diff --git a/deepray/custom_ops/multiplex_1/BUILD b/deepray/custom_ops/multiplex_1/BUILD index 6d6c699b..6c807e89 100644 --- a/deepray/custom_ops/multiplex_1/BUILD +++ b/deepray/custom_ops/multiplex_1/BUILD @@ -9,11 +9,20 @@ custom_op_library( "multiplex_1_kernel.cc", "multiplex_1_op.cc", ], + deps = [ + "//deepray/custom_ops/utils:ok_status_util", + ], ) py_library( - name = "multiplex_1_op", - srcs = ["multiplex_1_op.py"], + name = "multiplex_1", + srcs = glob( + [ + "python/*.py", + "python/**/*.py", + "*.py", + ], + ), data = [":multiplex_1_kernel.so"], srcs_version = "PY3", visibility = ["//visibility:public"], @@ -29,6 +38,8 @@ py_test( "no_mac", # TODO(b/216321151): Re-enable this test. ], deps = [ - ":multiplex_1_op", + ":multiplex_1", + "@pypi_numpy//:pkg", + "@pypi_tensorflow//:pkg", ], ) diff --git a/build_deps/toolchains/gpu/crosstool/BUILD b/deepray/custom_ops/multiplex_1/__init__.py similarity index 100% rename from build_deps/toolchains/gpu/crosstool/BUILD rename to deepray/custom_ops/multiplex_1/__init__.py diff --git a/deepray/custom_ops/multiplex_1/multiplex_1_op.cc b/deepray/custom_ops/multiplex_1/multiplex_1_op.cc index f2c8d015..3de4974b 100644 --- a/deepray/custom_ops/multiplex_1/multiplex_1_op.cc +++ b/deepray/custom_ops/multiplex_1/multiplex_1_op.cc @@ -13,16 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" -/* After TensorFlow version 2.10.0, "Status::OK()" upgraded to "OkStatus()". -This code is for compatibility.*/ -#if TF_VERSION_INTEGER >= 2100 -#define TFOkStatus ::tensorflow::OkStatus() -#else -#define TFOkStatus ::tensorflow::Status::OK() -#endif +using namespace tensorflow; // Use a namespace when registering by prepending the // package's name to the op’s name and separate with a '>'. diff --git a/deepray/custom_ops/multiplex_2/BUILD b/deepray/custom_ops/multiplex_2/BUILD index b4dfb447..5478998b 100644 --- a/deepray/custom_ops/multiplex_2/BUILD +++ b/deepray/custom_ops/multiplex_2/BUILD @@ -13,7 +13,7 @@ custom_op_library( "multiplex_2_kernel.h", "multiplex_2_op.cc", ], - cuda_srcs = [ + gpu_srcs = [ "multiplex_2_kernel.h", "multiplex_2_kernel.cu.cc", ], @@ -21,7 +21,13 @@ custom_op_library( py_library( name = "multiplex_2_op", - srcs = ["multiplex_2_op.py"], + srcs = glob( + [ + "python/*.py", + "python/**/*.py", + "*.py", + ], + ), data = ["multiplex_2_kernel.so"], srcs_version = "PY3", visibility = ["//visibility:public"], @@ -38,5 +44,7 @@ py_test( ], deps = [ ":multiplex_2_op", + "@pypi_numpy//:pkg", + "@pypi_tensorflow//:pkg", ], ) diff --git a/build_deps/toolchains/gpu/cuda/BUILD b/deepray/custom_ops/multiplex_2/__init__.py similarity index 100% rename from build_deps/toolchains/gpu/cuda/BUILD rename to deepray/custom_ops/multiplex_2/__init__.py diff --git a/deepray/custom_ops/multiplex_2/multiplex_2_kernel.cc b/deepray/custom_ops/multiplex_2/multiplex_2_kernel.cc index 7a1e37b0..8174b934 100644 --- a/deepray/custom_ops/multiplex_2/multiplex_2_kernel.cc +++ b/deepray/custom_ops/multiplex_2/multiplex_2_kernel.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "deepray/custom_ops/multiplex_2/multiplex_2_kernel.h" +#include "multiplex_2_kernel.h" // Please use the appropriate namespace for your project namespace tensorflow { diff --git a/deepray/custom_ops/multiplex_2/multiplex_2_kernel.cu.cc b/deepray/custom_ops/multiplex_2/multiplex_2_kernel.cu.cc index c405fb4c..bed5e149 100644 --- a/deepray/custom_ops/multiplex_2/multiplex_2_kernel.cu.cc +++ b/deepray/custom_ops/multiplex_2/multiplex_2_kernel.cu.cc @@ -15,7 +15,7 @@ limitations under the License. #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM -#include "deepray/custom_ops/multiplex_2/multiplex_2_kernel.h" +#include "multiplex_2_kernel.h" // Please use the appropriate namespace for your project namespace tensorflow { diff --git a/deepray/custom_ops/multiplex_2/multiplex_2_op.cc b/deepray/custom_ops/multiplex_2/multiplex_2_op.cc index d59e88ac..0748d0ef 100644 --- a/deepray/custom_ops/multiplex_2/multiplex_2_op.cc +++ b/deepray/custom_ops/multiplex_2/multiplex_2_op.cc @@ -41,7 +41,7 @@ REGISTER_OP("Examples>MultiplexDense") TF_RETURN_IF_ERROR(c->Merge(c->input(0), c->input(2), &unused)); c->set_output(0, out); - return ::tensorflow::Status::OK(); + return ::tensorflow::OkStatus(); }) .Doc(R"doc( Return elements chosen from `a` or `b` depending on `cond`. 
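For context, `Examples>MultiplexDense` follows the same selection rule as `np.where`: element `i` of the output is taken from `a` when `cond[i]` is true and from `b` otherwise. A minimal usage sketch, assuming the `multiplex_2_op` py_library exposes a `multiplex(cond, a, b)` wrapper (the module path and function name are assumptions, not shown in this hunk):

```python
import numpy as np
import tensorflow as tf

# Assumed import path for the Python wrapper built by the multiplex_2_op target.
from deepray.custom_ops.multiplex_2 import multiplex_2_op

cond = tf.constant([True, False, True, False])
a = tf.constant([1, 2, 3, 4], dtype=tf.int64)
b = tf.constant([10, 20, 30, 40], dtype=tf.int64)

# Element-wise selection: a[i] where cond[i] is True, otherwise b[i].
result = multiplex_2_op.multiplex(cond, a, b)
np.testing.assert_array_equal(result.numpy(), np.where(cond.numpy(), a.numpy(), b.numpy()))  # [1, 20, 3, 40]
```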
diff --git a/deepray/custom_ops/multiplex_3/BUILD b/deepray/custom_ops/multiplex_3/BUILD index 068baf4d..008948cf 100644 --- a/deepray/custom_ops/multiplex_3/BUILD +++ b/deepray/custom_ops/multiplex_3/BUILD @@ -14,11 +14,20 @@ custom_op_library( "multiplex_3_kernel.cc", "multiplex_3_op.cc", ], + deps = [ + "//deepray/custom_ops/utils:ok_status_util", + ], ) py_library( name = "multiplex_3_op", - srcs = ["multiplex_3_op.py"], + srcs = glob( + [ + "python/*.py", + "python/**/*.py", + "*.py", + ], + ), data = [":multiplex_3_kernel.so"], srcs_version = "PY3", visibility = ["//visibility:public"], @@ -38,5 +47,7 @@ py_test( ], deps = [ ":multiplex_3_op", + "@pypi_numpy//:pkg", + "@pypi_tensorflow//:pkg", ], ) diff --git a/third_party/cucollection/BUILD b/deepray/custom_ops/multiplex_3/__init__.py similarity index 100% rename from third_party/cucollection/BUILD rename to deepray/custom_ops/multiplex_3/__init__.py diff --git a/deepray/custom_ops/multiplex_3/multiplex_3_kernel.cc b/deepray/custom_ops/multiplex_3/multiplex_3_kernel.cc index 374376bf..6c34ffa7 100644 --- a/deepray/custom_ops/multiplex_3/multiplex_3_kernel.cc +++ b/deepray/custom_ops/multiplex_3/multiplex_3_kernel.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/platform/errors.h" @@ -204,7 +205,7 @@ class MultiplexSparseOp : public OpKernel { indices_tensor.shape().DebugString(), " values: ", values_tensor.shape().DebugString()); } - return Status::OK(); + return TFOkStatus; } }; diff --git a/deepray/custom_ops/multiplex_3/multiplex_3_op.cc b/deepray/custom_ops/multiplex_3/multiplex_3_op.cc index 80be6976..4e852b69 100644 --- a/deepray/custom_ops/multiplex_3/multiplex_3_op.cc +++ b/deepray/custom_ops/multiplex_3/multiplex_3_op.cc @@ -13,9 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" +using namespace tensorflow; + // Use a namespace when registering by prepending the // package's name to the op’s name and separate with a '>'. // This is the recommendation for out-of-tree ops to avoid name collisions in @@ -52,7 +55,7 @@ REGISTER_OP("Examples>MultiplexSparse") c->set_output(0, c->Matrix(num_rows, dense_rank)); c->set_output(1, c->Vector(num_rows)); c->set_output(2, c->Vector(dense_rank)); - return ::tensorflow::Status::OK(); + return TFOkStatus; }) .Doc(R"doc( Return elements chosen from `a` or `b` depending on `cond`. 
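The shape function above declares the three outputs of `Examples>MultiplexSparse` as the standard COO components of a sparse tensor: an indices matrix, a values vector, and a dense-shape vector. A short sketch (with made-up numbers) of how such components assemble back into a `tf.SparseTensor`:

```python
import tensorflow as tf

# Hypothetical COO components with 2 selected elements and rank 2, matching the
# shapes declared by the op's shape function: [N, rank], [N] and [rank].
indices = tf.constant([[0, 1], [2, 0]], dtype=tf.int64)
values = tf.constant([7, 9], dtype=tf.int64)
dense_shape = tf.constant([3, 4], dtype=tf.int64)

chosen = tf.SparseTensor(indices=indices, values=values, dense_shape=dense_shape)
print(tf.sparse.to_dense(chosen).numpy())  # [[0 7 0 0], [0 0 0 0], [9 0 0 0]]
```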
diff --git a/deepray/custom_ops/multiplex_4/BUILD b/deepray/custom_ops/multiplex_4/BUILD index b699319b..7cac986d 100644 --- a/deepray/custom_ops/multiplex_4/BUILD +++ b/deepray/custom_ops/multiplex_4/BUILD @@ -10,6 +10,9 @@ custom_op_library( "multiplex_4_kernel.cc", "multiplex_4_op.cc", ], + deps = [ + "//deepray/custom_ops/utils:ok_status_util", + ], ) py_library( @@ -38,6 +41,8 @@ py_test( deps = [ ":model_using_multiplex", ":multiplex_4_op", + "@pypi_numpy//:pkg", + "@pypi_tensorflow//:pkg", ], ) diff --git a/deepray/custom_ops/multiplex_4/__init__.py b/deepray/custom_ops/multiplex_4/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/deepray/custom_ops/multiplex_4/multiplex_4_op.cc b/deepray/custom_ops/multiplex_4/multiplex_4_op.cc index 88a5ec06..102d2142 100644 --- a/deepray/custom_ops/multiplex_4/multiplex_4_op.cc +++ b/deepray/custom_ops/multiplex_4/multiplex_4_op.cc @@ -15,6 +15,7 @@ limitations under the License. #include +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" @@ -42,7 +43,7 @@ Status MultiplexShapeFunction(InferenceContext* c) { TF_RETURN_IF_ERROR(c->Merge(c->input(i), c->input(last), &unused)); } c->set_output(0, c->input(last)); - return Status::OK(); + return TFOkStatus; } REGISTER_OP("Examples>MultiplexDense") diff --git a/deepray/custom_ops/parquet_dataset/BUILD b/deepray/custom_ops/parquet_dataset/BUILD index f8b14c26..f5ce380a 100644 --- a/deepray/custom_ops/parquet_dataset/BUILD +++ b/deepray/custom_ops/parquet_dataset/BUILD @@ -1,5 +1,4 @@ -load("//deepray:deepray.bzl", "custom_op_library") -load("//deepray:tensorflow.bzl", "pybind_extension") +load("//deepray:deepray.bzl", "custom_op_library", "pybind_extension") licenses(["notice"]) # Apache 2.0 @@ -32,10 +31,9 @@ cc_library( "DEEPREC_ARROW_ZEROCOPY", ], deps = [ - "@com_github_apache_arrow//:arrow", - "@eigen3", "@local_config_tf//:libtensorflow_framework", "@local_config_tf//:tf_header_lib", + "@org_apache_arrow//:arrow", ], ) @@ -49,9 +47,9 @@ cc_library( ], deps = [ ":arrow_util", - "@com_github_apache_arrow//:arrow", "@local_config_tf//:libtensorflow_framework", "@local_config_tf//:tf_header_lib", + "@org_apache_arrow//:arrow", ], ) @@ -60,7 +58,9 @@ pybind_extension( srcs = [ "cc/kernels/parquet_pybind.cc", ], - copts = ["-fexceptions"], + copts = [ + "-fexceptions", + ], features = ["-use_header_modules"], module_name = "_parquet_pybind", deps = [ @@ -122,6 +122,8 @@ py_binary( name = "read_parquet_deepray", srcs = ["read_parquet_deepray.py"], deps = [ - "//deepray/custom_ops/parquet_dataset", + ":parquet_dataset", + "@pypi_fastparquet//:pkg", + "@pypi_pandas//:pkg", ], ) diff --git a/deepray/custom_ops/parquet_dataset/cc/kernels/arrow_util.cc b/deepray/custom_ops/parquet_dataset/cc/kernels/arrow_util.cc index 564df01e..8f57ab9b 100644 --- a/deepray/custom_ops/parquet_dataset/cc/kernels/arrow_util.cc +++ b/deepray/custom_ops/parquet_dataset/cc/kernels/arrow_util.cc @@ -14,9 +14,6 @@ limitations under the License. ==============================================================================*/ #include "arrow_util.h" -#include -#include -#include #include #include @@ -26,7 +23,10 @@ limitations under the License. 
#include #include +#include "arrow/array.h" +#include "arrow/util/thread_pool.h" #include "eigen.h" +#include "tensorflow/core/framework/allocation_description.pb.h" namespace tensorflow { namespace data { @@ -252,7 +252,7 @@ class RaggedTensorBuilder : public ::arrow::ArrayVisitor { #define CASE_ARROW_ENUM_SET_DTYPE(PTR, ENUM) \ case ENUM: { \ *PTR = DataTypeToEnum::Type>::value; \ - return Status::OK(); \ + return OkStatus(); \ } Status MakeDataTypeAndRaggedRankFromArrowDataType( @@ -280,7 +280,7 @@ Status MakeDataTypeAndRaggedRankFromArrowDataType( return errors::Unimplemented("Arrow data type ", arrow_dtype->ToString(), " not supported."); } - return Status::OK(); + return OkStatus(); } Status MakeTensorsFromArrowArray( @@ -297,7 +297,7 @@ Status MakeTensorsFromArrowArray( RaggedTensorBuilder builder(dtype, ragged_rank); TF_RETURN_IF_ARROW_ERROR(builder.Build(arrow_array, output_tensors)); - return Status::OK(); + return OkStatus(); } int UpdateArrowCpuThreadPoolCapacityFromEnv() { @@ -315,7 +315,7 @@ ::arrow::Status OpenArrowFile( const std::string& filename) { #if DEEPREC_ARROW_HDFS if (filename.rfind("hdfs://", 0) == 0) { - ::arrow::internal::Uri uri; + ::arrow::util::Uri uri; ARROW_RETURN_NOT_OK(uri.Parse(filename)); ARROW_ASSIGN_OR_RAISE(auto options, ::arrow::fs::HdfsOptions::FromUri(uri)); std::shared_ptr<::arrow::io::HadoopFileSystem> fs; diff --git a/deepray/custom_ops/parquet_dataset/cc/kernels/eigen.h b/deepray/custom_ops/parquet_dataset/cc/kernels/eigen.h index f84dc9f3..d61a2140 100644 --- a/deepray/custom_ops/parquet_dataset/cc/kernels/eigen.h +++ b/deepray/custom_ops/parquet_dataset/cc/kernels/eigen.h @@ -18,7 +18,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/public/version.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "unsupported/Eigen/CXX11/Tensor" // NOTE: EIGEN_MAX_ALIGN_BYTES is 64 in TF 1.x. See: // DeepRec/third_party/eigen.BUILD#L67 diff --git a/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_batch_reader.cc b/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_batch_reader.cc index 677b4dfa..7000331d 100644 --- a/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_batch_reader.cc +++ b/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_batch_reader.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "deepray/custom_ops/parquet_dataset/cc/kernels/parquet_batch_reader.h" +#include "parquet_batch_reader.h" #include #include @@ -20,17 +20,17 @@ limitations under the License. 
#include #include "absl/strings/match.h" -#include "deepray/custom_ops/parquet_dataset/cc/kernels/arrow_util.h" +#include "arrow_util.h" namespace tensorflow { namespace data { class ParquetBatchReader::Impl { public: - Impl(const string& filename, const int64 batch_size, - const std::vector& field_names, - const DataTypeVector& field_dtypes, - const std::vector& field_ragged_ranks, + Impl(const string &filename, const int64 batch_size, + const std::vector &field_names, + const DataTypeVector &field_dtypes, + const std::vector &field_ragged_ranks, const int64 partition_count, const int64 partition_index, const bool drop_remainder) : filename_(filename), @@ -44,7 +44,7 @@ class ParquetBatchReader::Impl { Status Open() { if (TF_PREDICT_TRUE(batch_reader_)) { - return Status::OK(); + return OkStatus(); } if (TF_PREDICT_FALSE(partition_index_ >= partition_count_)) { return errors::InvalidArgument("Partition index ", partition_index_, @@ -71,15 +71,15 @@ class ParquetBatchReader::Impl { " must has distinct column names"); } for (size_t i = 0; i < field_names_.size(); ++i) { - auto& cname = field_names_[i]; + auto &cname = field_names_[i]; int column_index = schema->GetFieldIndex(cname); if (TF_PREDICT_FALSE(column_index < 0)) { return errors::NotFound("No column called `", cname, "` found in ", filename_); } column_indices_.push_back(column_index); - const auto& expected_dtype = field_dtypes_[i]; - const auto& expected_ragged_rank = field_ragged_ranks_[i]; + const auto &expected_dtype = field_dtypes_[i]; + const auto &expected_ragged_rank = field_ragged_ranks_[i]; DataType actual_dtype; int32 actual_ragged_rank = 0; TF_RETURN_IF_ERROR(ArrowUtil::MakeDataTypeAndRaggedRankFromArrowDataType( @@ -101,10 +101,10 @@ class ParquetBatchReader::Impl { TF_RETURN_IF_ARROW_ERROR(reader_->GetRecordBatchReader( row_group_indices_, column_indices_, &batch_reader_)); - return Status::OK(); + return OkStatus(); } - Status Read(std::vector* output_tensors) { + Status Read(std::vector *output_tensors) { // Read next batch from parquet file. 
std::shared_ptr<::arrow::RecordBatch> batch; TF_RETURN_IF_ARROW_ERROR(batch_reader_->ReadNext(&batch)); @@ -123,7 +123,7 @@ class ParquetBatchReader::Impl { field_dtypes_[i], field_ragged_ranks_[i], arrays[i], output_tensors)); } - return Status::OK(); + return OkStatus(); } private: @@ -142,9 +142,9 @@ class ParquetBatchReader::Impl { }; ParquetBatchReader::ParquetBatchReader( - const string& filename, const int64 batch_size, - const std::vector& field_names, const DataTypeVector& field_dtypes, - const std::vector& field_ragged_ranks, const int64 partition_count, + const string &filename, const int64 batch_size, + const std::vector &field_names, const DataTypeVector &field_dtypes, + const std::vector &field_ragged_ranks, const int64 partition_count, const int64 partition_index, const bool drop_remainder) : pimpl_(new ParquetBatchReader::Impl( filename, batch_size, field_names, field_dtypes, field_ragged_ranks, @@ -152,7 +152,7 @@ ParquetBatchReader::ParquetBatchReader( Status ParquetBatchReader::Open() { return pimpl_->Open(); } -Status ParquetBatchReader::Read(std::vector* output_tensors) { +Status ParquetBatchReader::Read(std::vector *output_tensors) { return pimpl_->Read(output_tensors); } diff --git a/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_dataset_ops.cc b/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_dataset_ops.cc index 1bdcd582..7ab81028 100644 --- a/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_dataset_ops.cc +++ b/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_dataset_ops.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "deepray/custom_ops/parquet_dataset/cc/kernels/parquet_dataset_ops.h" +#include "parquet_dataset_ops.h" #include @@ -83,14 +83,14 @@ class ParquetTabularDatasetOp::Dataset : public DatasetBase { return output_shapes_; } - Status CheckExternalState() const override { return Status::OK(); } + Status CheckExternalState() const override { return OkStatus(); } string DebugString() const override { return "ParquetTabularDatasetOp::Dataset"; } Status InputDatasets(std::vector* inputs) const override { - return Status::OK(); + return OkStatus(); } protected: @@ -122,7 +122,7 @@ class ParquetTabularDatasetOp::Dataset : public DatasetBase { {"partition_index", partition_index}, {"drop_remainder", drop_remainder}}, output)); - return Status::OK(); + return OkStatus(); } private: @@ -159,7 +159,7 @@ class ParquetTabularDatasetOp::Dataset::Iterator return s; } *end_of_sequence = true; - return Status::OK(); + return OkStatus(); } protected: diff --git a/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_dataset_ops.h b/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_dataset_ops.h index 11659001..047a0277 100644 --- a/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_dataset_ops.h +++ b/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_dataset_ops.h @@ -15,7 +15,7 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_KERNELS_DATA_PARQUET_DATASET_OPS_H_ #define TENSORFLOW_CORE_KERNELS_DATA_PARQUET_DATASET_OPS_H_ -#include "deepray/custom_ops/parquet_dataset/cc/kernels/parquet_batch_reader.h" +#include "parquet_batch_reader.h" #include "tensorflow/core/framework/dataset.h" namespace tensorflow { diff --git a/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_pybind.cc b/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_pybind.cc index fc06b1c0..a516bb21 100644 --- a/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_pybind.cc +++ b/deepray/custom_ops/parquet_dataset/cc/kernels/parquet_pybind.cc @@ -12,16 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include -#include -#include - #include #include #include #include #include "arrow_util.h" +#include "pybind11/complex.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" namespace tensorflow { namespace data { diff --git a/deepray/custom_ops/parquet_dataset/python/dataframe.py b/deepray/custom_ops/parquet_dataset/python/dataframe.py index c7488515..b12ebc7e 100644 --- a/deepray/custom_ops/parquet_dataset/python/dataframe.py +++ b/deepray/custom_ops/parquet_dataset/python/dataframe.py @@ -23,9 +23,10 @@ from __future__ import print_function import collections + import numpy as np +import tensorflow as tf from six.moves import xrange # pylint: disable=redefined-builtin - from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor @@ -59,16 +60,12 @@ def __init__(self, name, dtype=None, ragged_rank=None, shape=None): self._ragged_rank = ragged_rank if shape: shape = tensor_shape.TensorShape(shape) - shape_rank = 0 - for _ in shape: - shape_rank += 1 - if ragged_rank is not None and ragged_rank != shape_rank: + for d in shape: + if d is None: + raise ValueError(f'Field {name} has incomplete shape: {shape}') + if ragged_rank is not None and ragged_rank > 1: raise ValueError(f'Field {name} is a nested list ({ragged_rank}) ' f'with shape {shape}') - self._ragged_rank = shape_rank - elif ragged_rank is not None: - shape = tensor_shape.TensorShape([None for _ in xrange(ragged_rank)]) - self._shape = shape @property @@ -130,16 +127,15 @@ def output_classes(self): def output_types(self): return self.map(lambda i: self._dtype if i == 0 else dtypes.int32) - def output_shapes(self, batch_size=None): + @property + def output_shapes(self): if self._shape is None: - return self.map(lambda i: tensor_shape.TensorShape(batch_size) if i == 0 else tensor_shape.TensorShape(None)) - return self.map( - lambda i: tensor_shape.TensorShape(batch_size).concatenate(self._shape) - if i == 0 else tensor_shape.TensorShape(None) - ) + return self.map(lambda _: tf.TensorShape(None)) + return self.map(lambda i: tf.TensorShape(None).concatenate(self._shape) if i == 0 else tf.TensorShape(None)) - def output_specs(self, batch_size=None): - shape = tensor_shape.TensorShape(batch_size) + @property + def output_specs(self): + shape = tf.TensorShape(None) if self._shape is not None: shape = shape.concatenate(self._shape) specs = [tensor_spec.TensorSpec(shape, dtype=self._dtype)] diff --git a/deepray/custom_ops/parquet_dataset/python/parquet_dataset_ops.py b/deepray/custom_ops/parquet_dataset/python/parquet_dataset_ops.py index a8766954..67330c7b 100644 --- 
a/deepray/custom_ops/parquet_dataset/python/parquet_dataset_ops.py +++ b/deepray/custom_ops/parquet_dataset/python/parquet_dataset_ops.py @@ -18,6 +18,7 @@ from __future__ import division from __future__ import print_function +import tensorflow as tf from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import readers from tensorflow.python.framework import dtypes @@ -32,6 +33,7 @@ from .parquet_pybind import parquet_filenames_and_fields _parquet_dataset_ops_so = LazySO("custom_ops/parquet_dataset/_parquet_dataset_ops.so") +gen_parquet_ops = _parquet_dataset_ops_so.ops class DataFrameValueSpec(type_spec.BatchableTypeSpec): @@ -40,24 +42,22 @@ class DataFrameValueSpec(type_spec.BatchableTypeSpec): def value_type(self): return DataFrame.Value if self._ragged_rank > 0 else ops.Tensor - def __init__(self, field, batch_size=None): + def __init__(self, field): """Constructs a type specification for a `tf.RaggedTensor`. Args: field: The field definition. - batch_size: The batch_size of DataFrame. """ if field.incomplete: raise ValueError(f'Field {field} is incomplete, please specify dtype and ragged_rank') self._field = field - self._batch_size = batch_size def _serialize(self): return (self._field.dtype, self._field.ragged_rank) @property def _component_specs(self): - return self._field.output_specs(self._batch_size) + return self._field.output_specs def _to_components(self, value): if isinstance(value, DataFrame.Value): @@ -81,7 +81,7 @@ def _to_legacy_output_types(self): return self._field.output_types def _to_legacy_output_shapes(self): - return self._field.output_shapes(self._batch_size) + return self._field.output_shapes def _to_legacy_output_classes(self): return self._field.output_classes @@ -105,12 +105,18 @@ def __init__(self, filename, batch_size, fields, partition_count=1, partition_in self._filename = ops.convert_to_tensor(filename, dtype=dtypes.string, name='filename') self._batch_size = ops.convert_to_tensor(batch_size, dtype=dtypes.int64, name='batch_size') self._fields = fields - self._output_specs = { - f.name: ( - DataFrameValueSpec(f, batch_size if drop_remainder else None) if f.ragged_rank > 0 else - tensor_spec.TensorSpec(shape=[batch_size if drop_remainder else None], dtype=f.dtype) - ) for f in self._fields - } + self._output_specs = {} + for f in self._fields: + item = None + if f.ragged_rank > 0: + item = DataFrameValueSpec(f) + else: + shape = tf.TensorShape(batch_size if drop_remainder else None) + if f.shape: + shape = shape.concatenate(f.shape) + item = tensor_spec.TensorSpec(shape=shape, dtype=f.dtype) + self._output_specs[f.name] = item + self._field_names = nest.flatten({f.name: f.name for f in self._fields}) self._field_dtypes = nest.flatten({f.name: f.dtype for f in self._fields}) self._field_ragged_ranks = nest.flatten({f.name: f.ragged_rank for f in self._fields}) @@ -118,7 +124,7 @@ def __init__(self, filename, batch_size, fields, partition_count=1, partition_in self._partition_index = partition_index self._drop_remainder = drop_remainder - variant_tensor = _parquet_dataset_ops_so.ops.parquet_tabular_dataset_v1( + variant_tensor = gen_parquet_ops.parquet_tabular_dataset_v1( self._filename, self._batch_size, field_names=self._field_names, @@ -227,9 +233,12 @@ def element_spec(self): def _build_dataset(self, dataset_creator, filenames, num_parallel_reads=None, num_sequential_reads=1): """Internal method to create a `ParquetDataset`.""" if num_parallel_reads is None: + # Sequential Reading return filenames.flat_map(dataset_creator) 
if num_parallel_reads == dataset_ops.AUTOTUNE: + # Auto-tuned Parallel Reading return filenames.interleave(dataset_creator, num_parallel_calls=num_parallel_reads) + # Specified Parallel Reading return readers.ParallelInterleaveDataset( filenames, dataset_creator, diff --git a/deepray/custom_ops/parquet_dataset/python/parquet_pybind.py b/deepray/custom_ops/parquet_dataset/python/parquet_pybind.py index 7ce65b64..8bd32f7e 100644 --- a/deepray/custom_ops/parquet_dataset/python/parquet_pybind.py +++ b/deepray/custom_ops/parquet_dataset/python/parquet_pybind.py @@ -20,15 +20,16 @@ import numpy as np from six import string_types as string - from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.platform import tf_logging as logging -from .dataframe import DataFrame +from tensorflow.python.types import core + from deepray.custom_ops.parquet_dataset import _parquet_pybind as _lib +from .dataframe import DataFrame def parquet_fields(filename, fields=None, lower=False): @@ -121,7 +122,7 @@ def parquet_filenames_and_fields(filenames, fields, lower=False): raise ValueError(f'Field {f} must be `hb.data.DataFrame.Field`.') if f.incomplete: raise ValueError(f'Field {f} is incomplete, please specify dtype and ragged_rank') - elif isinstance(filenames, ops.Tensor): + elif isinstance(filenames, core.Tensor): if filenames.dtype != dtypes.string: raise TypeError('`filenames` must be a `tf.Tensor` of `tf.string`.') if fields is None: diff --git a/deepray/custom_ops/parquet_dataset/python/tests/parquet_dataset_ops_test.py b/deepray/custom_ops/parquet_dataset/python/tests/parquet_dataset_ops_test.py index 32d1393f..10d3fc26 100644 --- a/deepray/custom_ops/parquet_dataset/python/tests/parquet_dataset_ops_test.py +++ b/deepray/custom_ops/parquet_dataset/python/tests/parquet_dataset_ops_test.py @@ -18,16 +18,19 @@ from __future__ import division from __future__ import print_function +import numpy as np +import pandas as pd import os +from six.moves import xrange # pylint: disable=redefined-builtin import tempfile -import numpy as np -import pandas as pd import tensorflow as tf +# from tensorflow.python.data.experimental.ops import parquet_dataset_ops +from deepray.custom_ops.parquet_dataset import parquet_dataset_ops + from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.platform import test - -from deepray.custom_ops.parquet_dataset import parquet_dataset_ops +from tensorflow.python.data.ops.dataset_ops import AUTOTUNE class ParquetDatasetTest(test_base.DatasetTestBase): @@ -42,154 +45,152 @@ def setUpClass(self): def test_read(self): batch_size = 32 - ds = parquet_dataset_ops.ParquetDataset( - self._filename, - batch_size=batch_size, - fields=[parquet_dataset_ops.DataFrame.Field('A', tf.int64), - parquet_dataset_ops.DataFrame.Field('C', tf.int64)] - ) - ds = ds.prefetch(4) - # batch = tf.data.make_one_shot_iterator(ds).get_next() + with tf.Graph().as_default() as graph: + ds = parquet_dataset_ops.ParquetDataset( + self._filename, + batch_size=batch_size, + fields=[ + parquet_dataset_ops.DataFrame.Field('A', tf.int64), + parquet_dataset_ops.DataFrame.Field('C', tf.int64) + ] + ) + ds = ds.prefetch(4) + batch = tf.data.make_one_shot_iterator(ds).get_next() a = self._df['A'] c = self._df['C'] - i = 0 - for result in ds.take(3): - print(result) - # i += 1 - # start_row = i * batch_size 
- # end_row = (i + 1) * batch_size - # np.testing.assert_equal(result['A'], a[start_row:end_row].to_numpy()) - # np.testing.assert_equal(result['C'], c[start_row:end_row].to_numpy()) - - # def test_schema_auto_detection_read(self): - # batch_size = 32 - # with tf.Graph().as_default() as graph: - # ds = parquet_dataset_ops.ParquetDataset([self._filename], batch_size=batch_size) - # ds = ds.prefetch(4) - # batch = tf.data.make_one_shot_iterator(ds).get_next() - - # c = self._df['C'] - # with tf.Session(graph=graph) as sess: - # for i in xrange(3): - # result = sess.run(batch) - # start_row = i * batch_size - # end_row = (i + 1) * batch_size - # np.testing.assert_equal(result['C'], c[start_row:end_row].to_numpy()) - - # def test_dtype_auto_detection_read(self): - # batch_size = 32 - # with tf.Graph().as_default() as graph: - # ds = parquet_dataset_ops.ParquetDataset( - # [self._filename], - # batch_size=batch_size, - # fields=['B', 'C']) - # ds = ds.prefetch(4) - # batch = tf.data.make_one_shot_iterator(ds).get_next() - - # c = self._df['C'] - # with tf.Session(graph=graph) as sess: - # for i in xrange(3): - # result = sess.run(batch) - # start_row = i * batch_size - # end_row = (i + 1) * batch_size - # np.testing.assert_equal(result['C'], c[start_row:end_row].to_numpy()) - - # def test_dtype_auto_detection_read_lower(self): - # batch_size = 32 - # with tf.Graph().as_default() as graph: - # actual_fields = parquet_dataset_ops.ParquetDataset.read_schema( - # self._filename, ['B', 'D'], lower=True) - # fld = actual_fields[1].name - # ds = parquet_dataset_ops.ParquetDataset( - # [self._filename], - # batch_size=batch_size, - # fields=actual_fields) - # ds = ds.prefetch(4) - # batch = tf.data.make_one_shot_iterator(ds).get_next() - - # c = self._df[fld] - # with tf.Session(graph=graph) as sess: - # for i in xrange(3): - # result = sess.run(batch) - # start_row = i * batch_size - # end_row = (i + 1) * batch_size - # np.testing.assert_equal(result[fld], c[start_row:end_row].to_numpy()) - - # def test_read_from_generator(self): - # num_epochs = 2 - # batch_size = 100 - # with tf.Graph().as_default() as graph: - # def gen_filenames(): - # for i in xrange(num_epochs + 1): - # if i == num_epochs: - # return # raise StopIteration - # yield self._filename - # filenames = tf.data.Dataset.from_generator( - # gen_filenames, tf.string, tf.TensorShape([])) - # fields = [ - # parquet_dataset_ops.DataFrame.Field('A', tf.int64, 0), - # parquet_dataset_ops.DataFrame.Field('C', tf.int64, 0)] - # ds = filenames.apply(parquet_dataset_ops.read_parquet(batch_size, fields=fields)) - # ds = ds.prefetch(4) - # batch = tf.data.make_one_shot_iterator(ds).get_next() - - # with tf.Session(graph=graph) as sess: - # for _ in xrange(len(self._df) * num_epochs // batch_size): - # sess.run(batch) - # with self.assertRaises(tf.errors.OutOfRangeError): - # sess.run(batch) - - # def test_read_from_generator_parallel(self): - # num_epochs = 2 - # batch_size = 100 - # with tf.Graph().as_default() as graph: - # def gen_filenames(): - # for i in xrange(num_epochs + 1): - # if i == num_epochs: - # return # raise StopIteration - # yield self._filename - # filenames = tf.data.Dataset.from_generator( - # gen_filenames, tf.string, tf.TensorShape([])) - # fields = [ - # parquet_dataset_ops.DataFrame.Field('A', tf.int64, 0), - # parquet_dataset_ops.DataFrame.Field('C', tf.int64, 0)] - # ds = filenames.apply( - # parquet_dataset_ops.read_parquet(batch_size, fields=fields, num_parallel_reads=3)) - # ds = ds.prefetch(4) - # batch = 
tf.data.make_one_shot_iterator(ds).get_next() - - # with tf.Session(graph=graph) as sess: - # for _ in xrange(len(self._df) * num_epochs // batch_size): - # sess.run(batch) - # with self.assertRaises(tf.errors.OutOfRangeError): - # sess.run(batch) - - # def test_read_from_generator_parallel_auto(self): - # num_epochs = 2 - # batch_size = 100 - # with tf.Graph().as_default() as graph: - # def gen_filenames(): - # for i in xrange(num_epochs + 1): - # if i == num_epochs: - # return # raise StopIteration - # yield self._filename - # filenames = tf.data.Dataset.from_generator( - # gen_filenames, tf.string, tf.TensorShape([])) - # fields = [ - # parquet_dataset_ops.DataFrame.Field('A', tf.int64, 0), - # parquet_dataset_ops.DataFrame.Field('C', tf.int64, 0)] - # ds = filenames.apply( - # parquet_dataset_ops.read_parquet( - # batch_size, fields=fields, num_parallel_reads=AUTOTUNE)) - # ds = ds.prefetch(4) - # batch = tf.data.make_one_shot_iterator(ds).get_next() - - # with tf.Session(graph=graph) as sess: - # for _ in xrange(len(self._df) * num_epochs // batch_size): - # sess.run(batch) - # with self.assertRaises(tf.errors.OutOfRangeError): - # sess.run(batch) + with tf.Session(graph=graph) as sess: + for i in xrange(3): + result = sess.run(batch) + start_row = i * batch_size + end_row = (i + 1) * batch_size + np.testing.assert_equal(result['A'], a[start_row:end_row].to_numpy()) + np.testing.assert_equal(result['C'], c[start_row:end_row].to_numpy()) + + def test_schema_auto_detection_read(self): + batch_size = 32 + with tf.Graph().as_default() as graph: + ds = parquet_dataset_ops.ParquetDataset([self._filename], batch_size=batch_size) + ds = ds.prefetch(4) + batch = tf.data.make_one_shot_iterator(ds).get_next() + + c = self._df['C'] + with tf.Session(graph=graph) as sess: + for i in xrange(3): + result = sess.run(batch) + start_row = i * batch_size + end_row = (i + 1) * batch_size + np.testing.assert_equal(result['C'], c[start_row:end_row].to_numpy()) + + def test_dtype_auto_detection_read(self): + batch_size = 32 + with tf.Graph().as_default() as graph: + ds = parquet_dataset_ops.ParquetDataset([self._filename], batch_size=batch_size, fields=['B', 'C']) + ds = ds.prefetch(4) + batch = tf.data.make_one_shot_iterator(ds).get_next() + + c = self._df['C'] + with tf.Session(graph=graph) as sess: + for i in xrange(3): + result = sess.run(batch) + start_row = i * batch_size + end_row = (i + 1) * batch_size + np.testing.assert_equal(result['C'], c[start_row:end_row].to_numpy()) + + def test_dtype_auto_detection_read_lower(self): + batch_size = 32 + with tf.Graph().as_default() as graph: + actual_fields = parquet_dataset_ops.ParquetDataset.read_schema(self._filename, ['B', 'D'], lower=True) + fld = actual_fields[1].name + ds = parquet_dataset_ops.ParquetDataset([self._filename], batch_size=batch_size, fields=actual_fields) + ds = ds.prefetch(4) + batch = tf.data.make_one_shot_iterator(ds).get_next() + + c = self._df[fld] + with tf.Session(graph=graph) as sess: + for i in xrange(3): + result = sess.run(batch) + start_row = i * batch_size + end_row = (i + 1) * batch_size + np.testing.assert_equal(result[fld], c[start_row:end_row].to_numpy()) + + def test_read_from_generator(self): + num_epochs = 2 + batch_size = 100 + with tf.Graph().as_default() as graph: + + def gen_filenames(): + for i in xrange(num_epochs + 1): + if i == num_epochs: + return # raise StopIteration + yield self._filename + + filenames = tf.data.Dataset.from_generator(gen_filenames, tf.string, tf.TensorShape([])) + fields = [ + 
parquet_dataset_ops.DataFrame.Field('A', tf.int64, 0), + parquet_dataset_ops.DataFrame.Field('C', tf.int64, 0) + ] + ds = filenames.apply(parquet_dataset_ops.read_parquet(batch_size, fields=fields)) + ds = ds.prefetch(4) + batch = tf.data.make_one_shot_iterator(ds).get_next() + + with tf.Session(graph=graph) as sess: + for _ in xrange(len(self._df) * num_epochs // batch_size): + sess.run(batch) + with self.assertRaises(tf.errors.OutOfRangeError): + sess.run(batch) + + def test_read_from_generator_parallel(self): + num_epochs = 2 + batch_size = 100 + with tf.Graph().as_default() as graph: + + def gen_filenames(): + for i in xrange(num_epochs + 1): + if i == num_epochs: + return # raise StopIteration + yield self._filename + + filenames = tf.data.Dataset.from_generator(gen_filenames, tf.string, tf.TensorShape([])) + fields = [ + parquet_dataset_ops.DataFrame.Field('A', tf.int64, 0), + parquet_dataset_ops.DataFrame.Field('C', tf.int64, 0) + ] + ds = filenames.apply(parquet_dataset_ops.read_parquet(batch_size, fields=fields, num_parallel_reads=3)) + ds = ds.prefetch(4) + batch = tf.data.make_one_shot_iterator(ds).get_next() + + with tf.Session(graph=graph) as sess: + for _ in xrange(len(self._df) * num_epochs // batch_size): + sess.run(batch) + with self.assertRaises(tf.errors.OutOfRangeError): + sess.run(batch) + + def test_read_from_generator_parallel_auto(self): + num_epochs = 2 + batch_size = 100 + with tf.Graph().as_default() as graph: + + def gen_filenames(): + for i in xrange(num_epochs + 1): + if i == num_epochs: + return # raise StopIteration + yield self._filename + + filenames = tf.data.Dataset.from_generator(gen_filenames, tf.string, tf.TensorShape([])) + fields = [ + parquet_dataset_ops.DataFrame.Field('A', tf.int64, 0), + parquet_dataset_ops.DataFrame.Field('C', tf.int64, 0) + ] + ds = filenames.apply(parquet_dataset_ops.read_parquet(batch_size, fields=fields, num_parallel_reads=AUTOTUNE)) + ds = ds.prefetch(4) + batch = tf.data.make_one_shot_iterator(ds).get_next() + + with tf.Session(graph=graph) as sess: + for _ in xrange(len(self._df) * num_epochs // batch_size): + sess.run(batch) + with self.assertRaises(tf.errors.OutOfRangeError): + sess.run(batch) if __name__ == "__main__": diff --git a/deepray/custom_ops/parquet_dataset/read_parquet_deepray.py b/deepray/custom_ops/parquet_dataset/read_parquet_deepray.py index 34411a21..ccd3a14e 100644 --- a/deepray/custom_ops/parquet_dataset/read_parquet_deepray.py +++ b/deepray/custom_ops/parquet_dataset/read_parquet_deepray.py @@ -12,6 +12,7 @@ os.environ['CUDA_VISIBLE_DEVICES'] = '' _workspace = tempfile.mkdtemp() _filename = os.path.join(_workspace, 'test.parquet') +print(_filename) # _df = pd.DataFrame( # np.random.randint(0, 100, size=(200, 4), dtype=np.int64), # columns=list('ABCd')) @@ -34,17 +35,18 @@ ds = parquet_dataset_ops.ParquetDataset( _filename, batch_size=batch_size, - fields=[ - parquet_dataset_ops.DataFrame.Field('A', tf.int64, ragged_rank=1), - parquet_dataset_ops.DataFrame.Field( - 'B', - tf.int64, - shape=[3], - ), - parquet_dataset_ops.DataFrame.Field('C', tf.int32), - parquet_dataset_ops.DataFrame.Field('D', tf.int64), - parquet_dataset_ops.DataFrame.Field('E', tf.string), - ] + fields=['A', 'C'] + # fields=[ + # parquet_dataset_ops.DataFrame.Field('A', tf.int64, ragged_rank=1), + # parquet_dataset_ops.DataFrame.Field( + # 'B', + # tf.int64, + # shape=[3], + # ), + # parquet_dataset_ops.DataFrame.Field('C', tf.int32), + # parquet_dataset_ops.DataFrame.Field('D', tf.int64), + # 
parquet_dataset_ops.DataFrame.Field('E', tf.string), + # ] ) ds = ds.prefetch(4) diff --git a/deepray/custom_ops/seq2seq/BUILD b/deepray/custom_ops/seq2seq/BUILD index 6eb65487..740c8275 100644 --- a/deepray/custom_ops/seq2seq/BUILD +++ b/deepray/custom_ops/seq2seq/BUILD @@ -11,8 +11,36 @@ custom_op_library( "cc/kernels/beam_search_ops.h", "cc/ops/beam_search_ops.cc", ], - cuda_srcs = [ + gpu_srcs = [ "cc/kernels/beam_search_ops.h", "cc/kernels/beam_search_ops_gpu.cu.cc", ], ) + +py_library( + name = "seq2seq", + srcs = glob( + [ + "python/*.py", + "*.py", + ], + ), + data = [ + ":_beam_search_ops.so", + ], +) + +py_test( + name = "seq2seq_test", + size = "medium", + srcs = glob(["python/tests/*"]), + main = "python/tests/run_all_test.py", + deps = [ + ":seq2seq", + "//deepray/utils", + "@pypi_numpy//:pkg", + "@pypi_pytest//:pkg", + "@pypi_pytest_xdist//:pkg", + "@pypi_tensorflow//:pkg", + ], +) diff --git a/deepray/custom_ops/seq2seq/__init__.py b/deepray/custom_ops/seq2seq/__init__.py new file mode 100644 index 00000000..39f88075 --- /dev/null +++ b/deepray/custom_ops/seq2seq/__init__.py @@ -0,0 +1,22 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Additional layers for sequence to sequence models.""" + +from .python import attention_wrapper +from .python import basic_decoder +from .python import beam_search_decoder +from .python import decoder +from .python import loss +from .python import sampler \ No newline at end of file diff --git a/deepray/custom_ops/seq2seq/cc/kernels/beam_search_ops.cc b/deepray/custom_ops/seq2seq/cc/kernels/beam_search_ops.cc index 7ea86176..6a5f20ec 100644 --- a/deepray/custom_ops/seq2seq/cc/kernels/beam_search_ops.cc +++ b/deepray/custom_ops/seq2seq/cc/kernels/beam_search_ops.cc @@ -32,8 +32,13 @@ limitations under the License. #include "tensorflow/core/framework/types.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/public/version.h" #include "tensorflow/core/util/work_sharder.h" +#if TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION >= 16 +#include "unsupported/Eigen/CXX11/Tensor" +#else #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#endif namespace tensorflow { namespace deepray { diff --git a/deepray/custom_ops/seq2seq/cc/kernels/beam_search_ops.h b/deepray/custom_ops/seq2seq/cc/kernels/beam_search_ops.h index ae85f60b..297592d7 100644 --- a/deepray/custom_ops/seq2seq/cc/kernels/beam_search_ops.h +++ b/deepray/custom_ops/seq2seq/cc/kernels/beam_search_ops.h @@ -18,7 +18,12 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/public/version.h" +#if TF_MAJOR_VERSION == 2 && TF_MINOR_VERSION >= 16 +#include "unsupported/Eigen/CXX11/Tensor" +#else #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#endif namespace tensorflow { class OpKernelContext; diff --git a/deepray/seq2seq/README.md b/deepray/custom_ops/seq2seq/python/README.md similarity index 90% rename from deepray/seq2seq/README.md rename to deepray/custom_ops/seq2seq/python/README.md index 7430fb3d..bdb3d6a8 100644 --- a/deepray/seq2seq/README.md +++ b/deepray/custom_ops/seq2seq/python/README.md @@ -1,14 +1,14 @@ -# Deepray - Seq2seq +# Addons - Seq2seq ## Contents -https://www.tensorflow.org/deepray/api_docs/python/dp/seq2seq +https://www.tensorflow.org/addons/api_docs/python/tfa/seq2seq ## Contribution Guidelines #### Standard API In order to conform with the current API standard, all objects must: * Inherit from proper base class within each module, eg `BaseDecoder` in decoder.py for customized decoder or `_BaseAttentionMechanism` for new attentions. - * Register as a keras global object so it can be serialized properly: `@tf.keras.utils.register_keras_serializable(package='Deepray')` + * Register as a keras global object so it can be serialized properly: `@tf.keras.utils.register_keras_serializable(package='Addons')` #### Testing Requirements * Simple unittests that demonstrate the class is behaving as expected on @@ -67,7 +67,7 @@ logits = outputs.rnn_output ``` ``` python -import deepray as dp +import tensorflow_addons as tfa # TF 2.0, new style @@ -79,9 +79,9 @@ encoder_outputs, state_h, state_c = encoder( encoder_state = (state_h, state_c) # Decoder RNN cell with attention -attention_mechanism = dp.seq2seq.LuongAttention(num_units, encoder_outputs) +attention_mechanism = tfa.seq2seq.LuongAttention(num_units, encoder_outputs) decoder_cell = tf.keras.layers.LSTMCell(num_units) -decoder_cell = dp.seq2seq.AttentionWrapper( +decoder_cell = tfa.seq2seq.AttentionWrapper( decoder_cell, attention_mechanism, attention_layer_size=num_units, @@ -89,11 +89,11 @@ decoder_cell = dp.seq2seq.AttentionWrapper( ) # Sampler -sampler = dp.seq2seq.sampler.TrainingSampler() +sampler = tfa.seq2seq.sampler.TrainingSampler() # Decoder projection_layer = tf.keras.layers.Dense(num_outputs) -decoder = dp.seq2seq.BasicDecoder(decoder_cell, sampler, output_layer=projection_layer) +decoder = tfa.seq2seq.BasicDecoder(decoder_cell, sampler, output_layer=projection_layer) # Dynamic decoding decoder_initial_state = decoder_cell.get_initial_state(inputs=decoder_inputs) @@ -145,14 +145,14 @@ outputs, _ = tf.contrib.seq2seq.dynamic_decode(decoder, ...) 
``` python # TF 2.0, new style -import deepray as dp +import tensorflow_addons as tfa # Replicate encoder infos beam_width times -decoder_initial_state = dp.seq2seq.tile_batch( +decoder_initial_state = tfa.seq2seq.tile_batch( encoder_state, multiplier=hparams.beam_width) # Define a beam-search decoder -decoder = dp.seq2seq.BeamSearchDecoder( +decoder = tfa.seq2seq.BeamSearchDecoder( cell=decoder_cell, beam_width=beam_width, output_layer=projection_layer, diff --git a/deepray/custom_ops/seq2seq/python/__init__.py b/deepray/custom_ops/seq2seq/python/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/deepray/seq2seq/attention_wrapper.py b/deepray/custom_ops/seq2seq/python/attention_wrapper.py similarity index 95% rename from deepray/seq2seq/attention_wrapper.py rename to deepray/custom_ops/seq2seq/python/attention_wrapper.py index 830ac6a1..b8eb8ad2 100644 --- a/deepray/seq2seq/attention_wrapper.py +++ b/deepray/custom_ops/seq2seq/python/attention_wrapper.py @@ -17,10 +17,12 @@ import collections import functools import math +from typing import Optional, Callable, Union, List import numpy as np - import tensorflow as tf +from packaging.version import Version +from typeguard import typechecked from deepray.utils import keras_utils from deepray.utils.types import ( @@ -31,8 +33,10 @@ Number, ) -from typeguard import typechecked -from typing import Optional, Callable, Union, List +if Version(tf.__version__) < Version("2.13"): + SERIALIZATION_ARGS = {} +else: + SERIALIZATION_ARGS = {"use_legacy_format": True} class AttentionMechanism(tf.keras.layers.Layer): @@ -355,11 +359,19 @@ def deserialize_inner_layer_from_config(cls, config, custom_objects): config = config.copy() query_layer_config = config.pop("query_layer", None) if query_layer_config: - query_layer = tf.keras.layers.deserialize(query_layer_config, custom_objects=custom_objects) + query_layer = tf.keras.layers.deserialize( + query_layer_config, + custom_objects=custom_objects, + **SERIALIZATION_ARGS, + ) config["query_layer"] = query_layer memory_layer_config = config.pop("memory_layer", None) if memory_layer_config: - memory_layer = tf.keras.layers.deserialize(memory_layer_config, custom_objects=custom_objects) + memory_layer = tf.keras.layers.deserialize( + memory_layer_config, + custom_objects=custom_objects, + **SERIALIZATION_ARGS, + ) config["memory_layer"] = memory_layer return config @@ -375,7 +387,7 @@ def state_size(self): return self.alignments_size def initial_alignments(self, batch_size, dtype): - """Creates the initial alignment values for the `dp.seq2seq.AttentionWrapper` + """Creates the initial alignment values for the `tfa.seq2seq.AttentionWrapper` class. This is important for attention mechanisms that use the previous @@ -395,7 +407,7 @@ def initial_alignments(self, batch_size, dtype): return tf.zeros([batch_size, self._alignments_size], dtype=dtype) def initial_state(self, batch_size, dtype): - """Creates the initial state values for the `dp.seq2seq.AttentionWrapper` class. + """Creates the initial state values for the `tfa.seq2seq.AttentionWrapper` class. 
This is important for attention mechanisms that use the previous alignment to calculate the alignment at the next time step @@ -767,7 +779,9 @@ def get_config(self): "normalize": self.normalize, "probability_fn": self.probability_fn_name, "kernel_initializer": tf.keras.initializers.serialize( - self.kernel_initializer) + self.kernel_initializer, + **SERIALIZATION_ARGS, + ) } # yapf: enable @@ -776,7 +790,10 @@ def get_config(self): @classmethod def from_config(cls, config, custom_objects=None): - config = AttentionMechanism.deserialize_inner_layer_from_config(config, custom_objects=custom_objects) + config = AttentionMechanism.deserialize_inner_layer_from_config( + config, + custom_objects=custom_objects, + ) return cls(**config) @@ -917,7 +934,7 @@ def _monotonic_probability_fn(score, previous_alignments, sigmoid_noise, mode, s test-time, and when hard attention is not desired. mode: How to compute the attention distribution. Must be one of 'recursive', 'parallel', or 'hard'. See the docstring for - `dp.seq2seq.monotonic_attention` for more information. + `tfa.seq2seq.monotonic_attention` for more information. seed: (optional) Random seed for pre-sigmoid noise. Returns: @@ -974,7 +991,7 @@ class BahdanauMonotonicAttention(_BaseMonotonicAttentionMechanism): to construct its attention distributions. Since the attention scores are passed through a sigmoid, a learnable scalar bias parameter is applied after the score function and before the sigmoid. Otherwise, it is - equivalent to `dp.seq2seq.BahdanauAttention`. This approach is proposed in + equivalent to `tfa.seq2seq.BahdanauAttention`. This approach is proposed in Colin Raffel, Minh-Thang Luong, Peter J. Liu, Ron J. Weiss, Douglas Eck, "Online and Linear-Time Attention by Enforcing Monotonic Alignments." @@ -1015,7 +1032,7 @@ def __init__( of the memory is large. mode: How to compute the attention distribution. Must be one of 'recursive', 'parallel', or 'hard'. See the docstring for - `dp.seq2seq.monotonic_attention` for more information. + `tfa.seq2seq.monotonic_attention` for more information. kernel_initializer: (optional), the name of the initializer for the attention kernel. dtype: The data type for the query and memory layers of the attention @@ -1128,7 +1145,9 @@ def get_config(self): "score_bias_init": self.score_bias_init, "mode": self.mode, "kernel_initializer": tf.keras.initializers.serialize( - self.kernel_initializer), + self.kernel_initializer, + **SERIALIZATION_ARGS, + ), } # yapf: enable @@ -1149,7 +1168,7 @@ class LuongMonotonicAttention(_BaseMonotonicAttentionMechanism): memory it can't attend to any prior points at subsequence output timesteps. It achieves this by using the `_monotonic_probability_fn` instead of `softmax` to construct its attention distributions. Otherwise, it is equivalent to - `dp.seq2seq.LuongAttention`. This approach is proposed in + `tfa.seq2seq.LuongAttention`. This approach is proposed in [Colin Raffel, Minh-Thang Luong, Peter J. Liu, Ron J. Weiss, Douglas Eck, "Online and Linear-Time Attention by Enforcing Monotonic Alignments." @@ -1189,7 +1208,7 @@ def __init__( of the memory is large. mode: How to compute the attention distribution. Must be one of 'recursive', 'parallel', or 'hard'. See the docstring for - `dp.seq2seq.monotonic_attention` for more information. + `tfa.seq2seq.monotonic_attention` for more information. dtype: The data type for the query and memory layers of the attention mechanism. name: Name to use when creating ops. 
@@ -1289,7 +1308,7 @@ class AttentionWrapperState( ), ) ): - """State of a `dp.seq2seq.AttentionWrapper`. + """State of a `tfa.seq2seq.AttentionWrapper`. Attributes: cell_state: The state of the wrapped RNN cell at the previous time @@ -1318,8 +1337,8 @@ def clone(self, **kwargs): >>> batch_size = 1 >>> memory = tf.random.normal(shape=[batch_size, 3, 100]) >>> encoder_state = [tf.zeros((batch_size, 100)), tf.zeros((batch_size, 100))] - >>> attention_mechanism = dp.seq2seq.LuongAttention(100, memory=memory, memory_sequence_length=[3] * batch_size) - >>> attention_cell = dp.seq2seq.AttentionWrapper(tf.keras.layers.LSTMCell(100), attention_mechanism, attention_layer_size=10) + >>> attention_mechanism = tfa.seq2seq.LuongAttention(100, memory=memory, memory_sequence_length=[3] * batch_size) + >>> attention_cell = tfa.seq2seq.AttentionWrapper(tf.keras.layers.LSTMCell(100), attention_mechanism, attention_layer_size=10) >>> decoder_initial_state = attention_cell.get_initial_state(batch_size=batch_size, dtype=tf.float32) >>> decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state) @@ -1488,11 +1507,11 @@ class AttentionWrapper(tf.keras.layers.AbstractRNNCell): >>> memory = tf.random.uniform([batch_size, max_time, hidden_size]) >>> memory_sequence_length = tf.fill([batch_size], max_time) >>> - >>> attention_mechanism = dp.seq2seq.LuongAttention(hidden_size) + >>> attention_mechanism = tfa.seq2seq.LuongAttention(hidden_size) >>> attention_mechanism.setup_memory(memory, memory_sequence_length) >>> >>> cell = tf.keras.layers.LSTMCell(hidden_size) - >>> cell = dp.seq2seq.AttentionWrapper( + >>> cell = tfa.seq2seq.AttentionWrapper( ... cell, attention_mechanism, attention_layer_size=hidden_size) >>> >>> inputs = tf.random.uniform([batch_size, hidden_size]) @@ -1520,11 +1539,11 @@ def __init__( ): """Construct the `AttentionWrapper`. - **NOTE** If you are using the `dp.seq2seq.BeamSearchDecoder` with a cell wrapped + **NOTE** If you are using the `tfa.seq2seq.BeamSearchDecoder` with a cell wrapped in `AttentionWrapper`, then you must ensure that: - The encoder output has been tiled to `beam_width` via - `dp.seq2seq.tile_batch` (NOT `tf.tile`). + `tfa.seq2seq.tile_batch` (NOT `tf.tile`). - The `batch_size` argument passed to the `get_initial_state` method of this wrapper is equal to `true_batch_size * beam_width`. 
- The initial state created with `get_initial_state` above contains a @@ -1538,18 +1557,18 @@ def __init__( >>> sequence_length = tf.convert_to_tensor([5]) >>> encoder_outputs = tf.random.uniform(shape=(batch_size, 5, 10)) >>> encoder_final_state = [tf.zeros((batch_size, 10)), tf.zeros((batch_size, 10))] - >>> tiled_encoder_outputs = dp.seq2seq.tile_batch(encoder_outputs, multiplier=beam_width) - >>> tiled_encoder_final_state = dp.seq2seq.tile_batch(encoder_final_state, multiplier=beam_width) - >>> tiled_sequence_length = dp.seq2seq.tile_batch(sequence_length, multiplier=beam_width) - >>> attention_mechanism = dp.seq2seq.BahdanauAttention(10, memory=tiled_encoder_outputs, memory_sequence_length=tiled_sequence_length) - >>> attention_cell = dp.seq2seq.AttentionWrapper(tf.keras.layers.LSTMCell(10), attention_mechanism) + >>> tiled_encoder_outputs = tfa.seq2seq.tile_batch(encoder_outputs, multiplier=beam_width) + >>> tiled_encoder_final_state = tfa.seq2seq.tile_batch(encoder_final_state, multiplier=beam_width) + >>> tiled_sequence_length = tfa.seq2seq.tile_batch(sequence_length, multiplier=beam_width) + >>> attention_mechanism = tfa.seq2seq.BahdanauAttention(10, memory=tiled_encoder_outputs, memory_sequence_length=tiled_sequence_length) + >>> attention_cell = tfa.seq2seq.AttentionWrapper(tf.keras.layers.LSTMCell(10), attention_mechanism) >>> decoder_initial_state = attention_cell.get_initial_state(batch_size=batch_size * beam_width, dtype=tf.float32) >>> decoder_initial_state = decoder_initial_state.clone(cell_state=tiled_encoder_final_state) Args: cell: A layer that implements the `tf.keras.layers.AbstractRNNCell` interface. - attention_mechanism: A list of `dp.seq2seq.AttentionMechanism` + attention_mechanism: A list of `tfa.seq2seq.AttentionMechanism` instances single instance. attention_layer_size: A list of Python integers or a single Python integer, the depth of the attention (output) layer(s). If `None` @@ -1673,7 +1692,7 @@ def cell_input_fn(inputs, attention): "When constructing AttentionWrapper %s: " % self.name + "Non-matching batch sizes between the memory " "(encoder output) and initial_cell_state. Are you using " "the BeamSearchDecoder? You may need to tile your " - "initial state via the dp.seq2seq.tile_batch " + "initial state via the tfa.seq2seq.tile_batch " "function with argument multiple=beam_width." ) with tf.control_dependencies( @@ -1752,10 +1771,10 @@ def output_size(self): @property def state_size(self): - """The `state_size` property of `dp.seq2seq.AttentionWrapper`. + """The `state_size` property of `tfa.seq2seq.AttentionWrapper`. Returns: - A `dp.seq2seq.AttentionWrapperState` tuple containing shapes used + A `tfa.seq2seq.AttentionWrapperState` tuple containing shapes used by this object. """ return AttentionWrapperState( @@ -1769,11 +1788,11 @@ def state_size(self): ) # sometimes a TensorArray def get_initial_state(self, inputs=None, batch_size=None, dtype=None): - """Return an initial (zero) state tuple for this `dp.seq2seq.AttentionWrapper`. + """Return an initial (zero) state tuple for this `tfa.seq2seq.AttentionWrapper`. **NOTE** Please see the initializer documentation for details of how - to call `get_initial_state` if using a `dp.seq2seq.AttentionWrapper` - with a `dp.seq2seq.BeamSearchDecoder`. + to call `get_initial_state` if using a `tfa.seq2seq.AttentionWrapper` + with a `tfa.seq2seq.BeamSearchDecoder`. Args: inputs: The inputs that will be fed to this cell. 
@@ -1781,7 +1800,7 @@ def get_initial_state(self, inputs=None, batch_size=None, dtype=None): dtype: The internal state data type. Returns: - An `dp.seq2seq.AttentionWrapperState` tuple containing zeroed out tensors and, + An `tfa.seq2seq.AttentionWrapperState` tuple containing zeroed out tensors and, possibly, empty `TensorArray` objects. Raises: @@ -1803,7 +1822,7 @@ def get_initial_state(self, inputs=None, batch_size=None, dtype=None): "(encoder output) and the requested batch size. Are you using " "the BeamSearchDecoder? If so, make sure your encoder output " "has been tiled to beam_width via " - "dp.seq2seq.tile_batch, and the batch_size= argument " + "tfa.seq2seq.tile_batch, and the batch_size= argument " "passed to get_initial_state is batch_size * beam_width." ) with tf.control_dependencies(self._batch_size_checks(batch_size, error_message)): # pylint: disable=bad-continuation @@ -1844,7 +1863,7 @@ def call(self, inputs, state, **kwargs): Args: inputs: (Possibly nested tuple of) Tensor, the input at this time step. - state: An instance of `dp.seq2seq.AttentionWrapperState` containing + state: An instance of `tfa.seq2seq.AttentionWrapperState` containing tensors from the previous time step. **kwargs: Dict, other keyword arguments for the cell call method. @@ -1852,11 +1871,11 @@ def call(self, inputs, state, **kwargs): A tuple `(attention_or_cell_output, next_state)`, where: - `attention_or_cell_output` depending on `output_attention`. - - `next_state` is an instance of `dp.seq2seq.AttentionWrapperState` + - `next_state` is an instance of `tfa.seq2seq.AttentionWrapperState` containing the state calculated at this time step. Raises: - TypeError: If `state` is not an instance of `dp.seq2seq.AttentionWrapperState`. + TypeError: If `state` is not an instance of `tfa.seq2seq.AttentionWrapperState`. """ if not isinstance(state, AttentionWrapperState): try: @@ -1880,7 +1899,7 @@ def call(self, inputs, state, **kwargs): "When applying AttentionWrapper %s: " % self.name + "Non-matching batch sizes between the memory " "(encoder output) and the query (decoder output). Are you using " "the BeamSearchDecoder? You may need to tile your memory input " - "via the dp.seq2seq.tile_batch function with argument " + "via the tfa.seq2seq.tile_batch function with argument " "multiple=beam_width." ) with tf.control_dependencies(self._batch_size_checks(cell_batch_size, error_message)): # pylint: disable=bad-continuation diff --git a/deepray/seq2seq/basic_decoder.py b/deepray/custom_ops/seq2seq/python/basic_decoder.py similarity index 87% rename from deepray/seq2seq/basic_decoder.py rename to deepray/custom_ops/seq2seq/python/basic_decoder.py index 48a7a75f..de3ea9b9 100644 --- a/deepray/seq2seq/basic_decoder.py +++ b/deepray/custom_ops/seq2seq/python/basic_decoder.py @@ -15,26 +15,25 @@ """A basic decoder that may sample to generate the next input.""" import collections +from typing import Optional import tensorflow as tf +from typeguard import typechecked -from deepray.seq2seq import decoder -from deepray.seq2seq import sampler as sampler_py from deepray.utils import keras_utils - -from typeguard import typechecked -from typing import Optional +from . import decoder +from . import sampler as sampler_py class BasicDecoderOutput(collections.namedtuple("BasicDecoderOutput", ("rnn_output", "sample_id"))): - """Outputs of a `dp.seq2seq.BasicDecoder` step. + """Outputs of a `tfa.seq2seq.BasicDecoder` step. Attributes: rnn_output: The output for this step. 
If the `output_layer` argument - of `dp.seq2seq.BasicDecoder` was set, it is the output of this layer, otherwise it + of `tfa.seq2seq.BasicDecoder` was set, it is the output of this layer, otherwise it is the output of the RNN cell. sample_id: The token IDs sampled for this step, as returned by the - `sampler` instance passed to `dp.seq2seq.BasicDecoder`. + `sampler` instance passed to `tfa.seq2seq.BasicDecoder`. """ pass @@ -43,11 +42,11 @@ class BasicDecoderOutput(collections.namedtuple("BasicDecoderOutput", ("rnn_outp class BasicDecoder(decoder.BaseDecoder): """Basic sampling decoder for training and inference. - The `dp.seq2seq.Sampler` instance passed as argument is responsible to sample from + The `tfa.seq2seq.Sampler` instance passed as argument is responsible to sample from the output distribution and produce the input for the next decoding step. The decoding loop is implemented by the decoder in its `__call__` method. - Example using `dp.seq2seq.TrainingSampler` for training: + Example using `tfa.seq2seq.TrainingSampler` for training: >>> batch_size = 4 >>> max_time = 7 @@ -58,10 +57,10 @@ class BasicDecoder(decoder.BaseDecoder): >>> >>> embedding_layer = tf.keras.layers.Embedding(input_vocab_size, embedding_size) >>> decoder_cell = tf.keras.layers.LSTMCell(hidden_size) - >>> sampler = dp.seq2seq.TrainingSampler() + >>> sampler = tfa.seq2seq.TrainingSampler() >>> output_layer = tf.keras.layers.Dense(output_vocab_size) >>> - >>> decoder = dp.seq2seq.BasicDecoder(decoder_cell, sampler, output_layer) + >>> decoder = tfa.seq2seq.BasicDecoder(decoder_cell, sampler, output_layer) >>> >>> input_ids = tf.random.uniform( ... [batch_size, max_time], maxval=input_vocab_size, dtype=tf.int64) @@ -76,10 +75,10 @@ class BasicDecoder(decoder.BaseDecoder): >>> logits.shape TensorShape([4, 7, 64]) - Example using `dp.seq2seq.GreedyEmbeddingSampler` for inference: + Example using `tfa.seq2seq.GreedyEmbeddingSampler` for inference: - >>> sampler = dp.seq2seq.GreedyEmbeddingSampler(embedding_layer) - >>> decoder = dp.seq2seq.BasicDecoder( + >>> sampler = tfa.seq2seq.GreedyEmbeddingSampler(embedding_layer) + >>> decoder = tfa.seq2seq.BasicDecoder( ... decoder_cell, sampler, output_layer, maximum_iterations=10) >>> >>> initial_state = decoder_cell.get_initial_state(batch_size=batch_size, dtype=tf.float32) @@ -106,11 +105,11 @@ def __init__( Args: cell: A layer that implements the `tf.keras.layers.AbstractRNNCell` interface. - sampler: A `dp.seq2seq.Sampler` instance. + sampler: A `tfa.seq2seq.Sampler` instance. output_layer: (Optional) An instance of `tf.keras.layers.Layer`, i.e., `tf.keras.layers.Dense`. Optional layer to apply to the RNN output prior to storing the result or sampling. - **kwargs: Other keyword arguments of `dp.seq2seq.BaseDecoder`. + **kwargs: Other keyword arguments of `tfa.seq2seq.BaseDecoder`. 
""" keras_utils.assert_like_rnncell("cell", cell) self.cell = cell diff --git a/deepray/seq2seq/beam_search_decoder.py b/deepray/custom_ops/seq2seq/python/beam_search_decoder.py similarity index 98% rename from deepray/seq2seq/beam_search_decoder.py rename to deepray/custom_ops/seq2seq/python/beam_search_decoder.py index 5626057b..25082eac 100644 --- a/deepray/seq2seq/beam_search_decoder.py +++ b/deepray/custom_ops/seq2seq/python/beam_search_decoder.py @@ -15,19 +15,18 @@ """A decoder that performs beam search.""" import collections -import numpy as np +from typing import Callable, Optional +import numpy as np import tensorflow as tf +from typeguard import typechecked from deepray import options -from deepray.seq2seq import attention_wrapper -from deepray.seq2seq import decoder from deepray.utils import keras_utils from deepray.utils.resource_loader import LazySO from deepray.utils.types import FloatTensorLike, TensorLike, Number - -from typeguard import typechecked -from typing import Callable, Optional +from . import attention_wrapper +from . import decoder _beam_search_so = LazySO("custom_ops/seq2seq/_beam_search_ops.so") @@ -44,7 +43,7 @@ class BeamSearchDecoderState( ), ) ): - """State of a `dp.seq2seq.BeamSearchDecoder`. + """State of a `tfa.seq2seq.BeamSearchDecoder`. Attributes: cell_state: The cell state returned at the previous time step. @@ -64,12 +63,12 @@ class BeamSearchDecoderState( class BeamSearchDecoderOutput( collections.namedtuple("BeamSearchDecoderOutput", ("scores", "predicted_ids", "parent_ids")) ): - """Outputs of a `dp.seq2seq.BeamSearchDecoder` step. + """Outputs of a `tfa.seq2seq.BeamSearchDecoder` step. Attributes: scores: The scores this step, which are the log probabilities over the output vocabulary, possibly penalized by length - and attention coverage. When `dp.seq2seq.BeamSearchDecoder` is created with + and attention coverage. When `tfa.seq2seq.BeamSearchDecoder` is created with `output_all_scores=False` (default), this will be a `float32` `Tensor` of shape `[batch_size, beam_width]` containing the top scores corresponding to the predicted IDs. When `output_all_scores=True`, @@ -93,7 +92,7 @@ class FinalBeamSearchDecoderOutput( predicted_ids: The final prediction. A tensor of shape `[batch_size, T, beam_width]` (or `[T, batch_size, beam_width]` if `output_time_major` is True). Beams are ordered from best to worst. - beam_search_decoder_output: An instance of `dp.seq2seq.BeamSearchDecoderOutput` that + beam_search_decoder_output: An instance of `tfa.seq2seq.BeamSearchDecoderOutput` that describes the state of the beam search. """ @@ -220,7 +219,7 @@ def gather_tree( """ if not options.is_custom_kernel_disabled(): try: - return _beam_search_so.ops.deepray_gather_tree(step_ids, parent_ids, max_sequence_lengths, end_token) + return _beam_search_so.ops.addons_gather_tree(step_ids, parent_ids, max_sequence_lengths, end_token) except tf.errors.NotFoundError: options.warn_fallback("gather_tree") @@ -686,10 +685,10 @@ class BeamSearchDecoder(BeamSearchDecoderMixin, decoder.BaseDecoder): """Beam search decoder. **NOTE** If you are using the `BeamSearchDecoder` with a cell wrapped in - `dp.seq2seq.AttentionWrapper`, then you must ensure that: + `tfa.seq2seq.AttentionWrapper`, then you must ensure that: - The encoder output has been tiled to `beam_width` via - `dp.seq2seq.tile_batch` (NOT `tf.tile`). + `tfa.seq2seq.tile_batch` (NOT `tf.tile`). 
- The `batch_size` argument passed to the `get_initial_state` method of this wrapper is equal to `true_batch_size * beam_width`. - The initial state created with `get_initial_state` above contains a @@ -699,11 +698,11 @@ class BeamSearchDecoder(BeamSearchDecoderMixin, decoder.BaseDecoder): An example: ``` - tiled_encoder_outputs = dp.seq2seq.tile_batch( + tiled_encoder_outputs = tfa.seq2seq.tile_batch( encoder_outputs, multiplier=beam_width) - tiled_encoder_final_state = dp.seq2seq.tile_batch( + tiled_encoder_final_state = tfa.seq2seq.tile_batch( encoder_final_state, multiplier=beam_width) - tiled_sequence_length = dp.seq2seq.tile_batch( + tiled_sequence_length = tfa.seq2seq.tile_batch( sequence_length, multiplier=beam_width) attention_mechanism = MyFavoriteAttentionMechanism( num_units=attention_depth, @@ -716,7 +715,7 @@ class BeamSearchDecoder(BeamSearchDecoderMixin, decoder.BaseDecoder): cell_state=tiled_encoder_final_state) ``` - Meanwhile, with `dp.seq2seq.AttentionWrapper`, coverage penalty is suggested to use + Meanwhile, with `tfa.seq2seq.AttentionWrapper`, coverage penalty is suggested to use when computing scores (https://arxiv.org/pdf/1609.08144.pdf). It encourages the decoding to cover all inputs. """ diff --git a/deepray/seq2seq/decoder.py b/deepray/custom_ops/seq2seq/python/decoder.py similarity index 97% rename from deepray/seq2seq/decoder.py rename to deepray/custom_ops/seq2seq/python/decoder.py index 97875555..f5b9bad5 100644 --- a/deepray/seq2seq/decoder.py +++ b/deepray/custom_ops/seq2seq/python/decoder.py @@ -15,14 +15,14 @@ """Base classes and functions for dynamic decoding.""" import abc - -import tensorflow as tf -from deepray.utils.types import TensorLike -from typeguard import typechecked from typing import Any, Optional, Tuple, Union +import tensorflow as tf # TODO: Find public API alternatives to these from tensorflow.python.ops import control_flow_util +from typeguard import typechecked + +from deepray.utils.types import TensorLike class Decoder(metaclass=abc.ABCMeta): @@ -37,7 +37,7 @@ class Decoder(metaclass=abc.ABCMeta): finished. - `training`: boolean whether it should behave in training mode or in inference mode. - - `outputs`: instance of `dp.seq2seq.BasicDecoderOutput`. Result of the decoding, at + - `outputs`: instance of `tfa.seq2seq.BasicDecoderOutput`. Result of the decoding, at each time step. """ @@ -105,12 +105,12 @@ def tracks_own_finished(self): """Describes whether the Decoder keeps track of finished states. Most decoders will emit a true/false `finished` value independently - at each time step. In this case, the `dp.seq2seq.dynamic_decode` function keeps + at each time step. In this case, the `tfa.seq2seq.dynamic_decode` function keeps track of which batch entries are already finished, and performs a logical OR to insert new batches to the finished set. Some decoders, however, shuffle batches / beams between time steps and - `dp.seq2seq.dynamic_decode` will mix up the finished state across these entries + `tfa.seq2seq.dynamic_decode` will mix up the finished state across these entries because it does not track the reshuffle across time steps. In this case, it is up to the decoder to declare that it will keep track of its own finished state by setting this property to `True`. @@ -135,7 +135,7 @@ class BaseDecoder(tf.keras.layers.Layer): finished. - `training`: boolean whether it should behave in training mode or in inference mode. - - `outputs`: instance of `dp.seq2seq.BasicDecoderOutput`. 
Result of the decoding, at + - `outputs`: instance of `tfa.seq2seq.BasicDecoderOutput`. Result of the decoding, at each time step. """ @@ -238,12 +238,12 @@ def tracks_own_finished(self): """Describes whether the Decoder keeps track of finished states. Most decoders will emit a true/false `finished` value independently - at each time step. In this case, the `dp.seq2seq.dynamic_decode` function keeps + at each time step. In this case, the `tfa.seq2seq.dynamic_decode` function keeps track of which batch entries are already finished, and performs a logical OR to insert new batches to the finished set. Some decoders, however, shuffle batches / beams between time steps and - `dp.seq2seq.dynamic_decode` will mix up the finished state across these entries + `tfa.seq2seq.dynamic_decode` will mix up the finished state across these entries because it does not track the reshuffle across time steps. In this case, it is up to the decoder to declare that it will keep track of its own finished state by setting this property to `True`. @@ -274,7 +274,7 @@ def dynamic_decode( Calls `initialize()` once and `step()` repeatedly on the decoder object. Args: - decoder: A `dp.seq2seq.Decoder` or `dp.seq2seq.BaseDecoder` instance. + decoder: A `tfa.seq2seq.Decoder` or `tfa.seq2seq.BaseDecoder` instance. output_time_major: Python boolean. Default: `False` (batch major). If `True`, outputs are returned as time major tensors (this mode is faster). Otherwise, outputs are returned as batch major tensors (this diff --git a/deepray/seq2seq/loss.py b/deepray/custom_ops/seq2seq/python/loss.py similarity index 99% rename from deepray/seq2seq/loss.py rename to deepray/custom_ops/seq2seq/python/loss.py index 148e8262..7d3cb9b6 100644 --- a/deepray/seq2seq/loss.py +++ b/deepray/custom_ops/seq2seq/python/loss.py @@ -14,11 +14,12 @@ # ============================================================================== """Loss functions for sequence models.""" -import tensorflow as tf -from deepray.utils.types import TensorLike +from typing import Callable, Optional +import tensorflow as tf from typeguard import typechecked -from typing import Callable, Optional + +from deepray.utils.types import TensorLike def sequence_loss( diff --git a/deepray/seq2seq/sampler.py b/deepray/custom_ops/seq2seq/python/sampler.py similarity index 99% rename from deepray/seq2seq/sampler.py rename to deepray/custom_ops/seq2seq/python/sampler.py index af6b5f1c..069d70db 100644 --- a/deepray/seq2seq/sampler.py +++ b/deepray/custom_ops/seq2seq/python/sampler.py @@ -15,13 +15,14 @@ """Objects sampling from the decoder output distribution and producing the next input.""" import abc +from typing import Callable, Optional import tensorflow as tf -from deepray.seq2seq import decoder -from deepray.utils.types import Initializer, TensorLike from typeguard import typechecked -from typing import Callable, Optional + from deepray.utils import types +from deepray.utils.types import Initializer, TensorLike +from . import decoder _transpose_batch_time = decoder._transpose_batch_time @@ -31,7 +32,7 @@ class Sampler(metaclass=abc.ABCMeta): Sampler classes implement the logic of sampling from the decoder output distribution and producing the inputs for the next decoding step. In most cases, they should not be - used directly but passed to a `dp.seq2seq.BasicDecoder` instance that will manage the + used directly but passed to a `tfa.seq2seq.BasicDecoder` instance that will manage the sampling. 
Here is an example using a training sampler directly to implement a custom decoding @@ -41,7 +42,7 @@ class Sampler(metaclass=abc.ABCMeta): >>> max_time = 7 >>> hidden_size = 16 >>> - >>> sampler = dp.seq2seq.TrainingSampler() + >>> sampler = tfa.seq2seq.TrainingSampler() >>> cell = tf.keras.layers.LSTMCell(hidden_size) >>> >>> input_tensors = tf.random.uniform([batch_size, max_time, hidden_size]) diff --git a/deepray/custom_ops/seq2seq/python/tests/__init__.py b/deepray/custom_ops/seq2seq/python/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/deepray/seq2seq/tests/attention_wrapper_test.py b/deepray/custom_ops/seq2seq/python/tests/attention_wrapper_test.py similarity index 98% rename from deepray/seq2seq/tests/attention_wrapper_test.py rename to deepray/custom_ops/seq2seq/python/tests/attention_wrapper_test.py index 6332f32b..82c91ba5 100644 --- a/deepray/seq2seq/tests/attention_wrapper_test.py +++ b/deepray/custom_ops/seq2seq/python/tests/attention_wrapper_test.py @@ -12,17 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Tests for dp.seq2seq.attention_wrapper.""" +"""Tests for tfa.seq2seq.attention_wrapper.""" import collections -import pytest import numpy as np +import pytest import tensorflow as tf +from packaging.version import Version -from deepray.seq2seq import attention_wrapper as wrapper -from deepray.seq2seq import basic_decoder -from deepray.seq2seq import sampler as sampler_py +from deepray.custom_ops.seq2seq import attention_wrapper as wrapper +from deepray.custom_ops.seq2seq import basic_decoder +from deepray.custom_ops.seq2seq import sampler as sampler_py class DummyData: @@ -123,6 +124,9 @@ def test_save_load_layer(attention_cls): model.compile("rmsprop", "mse") y_ref = model.predict_on_batch([x_test, dummy_data.query, dummy_data.state]) + if Version(tf.__version__) >= Version("2.13"): + model.use_legacy_config = True + config = model.get_config() weights = model.get_weights() loaded_model = tf.keras.Model.from_config(config, custom_objects={attention_cls.__name__: attention_cls}) @@ -173,7 +177,7 @@ def test_masking(): @pytest.mark.parametrize("attention_cls", attention_classes) def test_memory_re_setup(attention_cls): - class MyModel(tf.keras.Model): + class MyModel(tf.keras.models.Model): def __init__(self, vocab, embedding_dim, memory_size, units): super().__init__() @@ -749,7 +753,7 @@ def test_luong_monotonic_scaled(): def test_attention_state_with_keras_rnn(): - # See https://github.com/tensorflow/deepray/issues/1095. + # See https://github.com/tensorflow/addons/issues/1095. cell = tf.keras.layers.LSTMCell(8) mechanism = wrapper.LuongAttention(units=8, memory=tf.ones((2, 4, 8))) diff --git a/deepray/seq2seq/tests/basic_decoder_test.py b/deepray/custom_ops/seq2seq/python/tests/basic_decoder_test.py similarity index 99% rename from deepray/seq2seq/tests/basic_decoder_test.py rename to deepray/custom_ops/seq2seq/python/tests/basic_decoder_test.py index 74a62e29..5e47e17d 100644 --- a/deepray/seq2seq/tests/basic_decoder_test.py +++ b/deepray/custom_ops/seq2seq/python/tests/basic_decoder_test.py @@ -12,16 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Tests for dp.seq2seq.basic_decoder.""" +"""Tests for tfa.seq2seq.basic_decoder.""" import numpy as np import pytest - import tensorflow as tf -from deepray.seq2seq import attention_wrapper -from deepray.seq2seq import basic_decoder -from deepray.seq2seq import sampler as sampler_py +from deepray.custom_ops.seq2seq import attention_wrapper +from deepray.custom_ops.seq2seq import basic_decoder +from deepray.custom_ops.seq2seq import sampler as sampler_py @pytest.mark.parametrize("use_output_layer", [True, False]) diff --git a/deepray/seq2seq/tests/beam_search_decoder_test.py b/deepray/custom_ops/seq2seq/python/tests/beam_search_decoder_test.py similarity index 98% rename from deepray/seq2seq/tests/beam_search_decoder_test.py rename to deepray/custom_ops/seq2seq/python/tests/beam_search_decoder_test.py index 7ea81644..854554d0 100644 --- a/deepray/seq2seq/tests/beam_search_decoder_test.py +++ b/deepray/custom_ops/seq2seq/python/tests/beam_search_decoder_test.py @@ -12,14 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Tests for dp.seq2seq.seq2seq.beam_search_decoder.""" +"""Tests for tfa.seq2seq.seq2seq.beam_search_decoder.""" import numpy as np import pytest import tensorflow as tf -from deepray.seq2seq import attention_wrapper -from deepray.seq2seq import beam_search_decoder, gather_tree +from deepray.custom_ops.seq2seq import attention_wrapper +from deepray.custom_ops.seq2seq import beam_search_decoder @pytest.mark.usefixtures("run_custom_and_py_ops") @@ -42,7 +42,7 @@ def test_gather_tree(): expected_result = np.array([[[2, 2, 2], [6, 5, 6], [7, 8, 9]], [[2, 4, 4], [7, 6, 6], [8, 9, 10]]]).transpose([1, 0, 2]) - res = gather_tree( + res = beam_search_decoder.gather_tree( predicted_ids, parent_ids, max_sequence_lengths=max_sequence_lengths, diff --git a/deepray/seq2seq/tests/beam_search_ops_test.py b/deepray/custom_ops/seq2seq/python/tests/beam_search_ops_test.py similarity index 94% rename from deepray/seq2seq/tests/beam_search_ops_test.py rename to deepray/custom_ops/seq2seq/python/tests/beam_search_ops_test.py index 0d457da6..06270686 100644 --- a/deepray/seq2seq/tests/beam_search_ops_test.py +++ b/deepray/custom_ops/seq2seq/python/tests/beam_search_ops_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Tests for dp.seq2seq.beam_search_ops.""" +"""Tests for tfa.seq2seq.beam_search_ops.""" import itertools @@ -20,7 +20,7 @@ import pytest import tensorflow as tf -from deepray.seq2seq import gather_tree +from deepray.custom_ops.seq2seq import beam_search_decoder def _transpose_batch_time(x): @@ -35,7 +35,7 @@ def test_gather_tree_one(): parent_ids = _transpose_batch_time([[[0, 0, 0], [0, 1, 1], [2, 1, 2], [-1, -1, -1]]]) max_sequence_lengths = [3] expected_result = _transpose_batch_time([[[2, 2, 2], [6, 5, 6], [7, 8, 9], [10, 10, 10]]]) - beams = gather_tree( + beams = beam_search_decoder.gather_tree( step_ids=step_ids, parent_ids=parent_ids, max_sequence_lengths=max_sequence_lengths, @@ -54,7 +54,7 @@ def test_bad_parent_values_on_cpu(): max_sequence_lengths = [3] with pytest.raises(tf.errors.InvalidArgumentError, match="parent id"): - _ = gather_tree( + _ = beam_search_decoder.gather_tree( step_ids=step_ids, parent_ids=parent_ids, max_sequence_lengths=max_sequence_lengths, @@ -73,7 +73,7 @@ def test_bad_parent_values_on_gpu(): max_sequence_lengths = [3] with pytest.raises(tf.errors.InvalidArgumentError, match="parent id"): - _ = gather_tree( + _ = beam_search_decoder.gather_tree( step_ids=step_ids, parent_ids=parent_ids, max_sequence_lengths=max_sequence_lengths, @@ -92,7 +92,7 @@ def test_gather_tree_batch(): step_ids = np.random.randint(0, high=end_token + 1, size=(max_time, batch_size, beam_width)) parent_ids = np.random.randint(0, high=beam_width - 1, size=(max_time, batch_size, beam_width)) - beams = gather_tree( + beams = beam_search_decoder.gather_tree( step_ids=step_ids.astype(np.int32), parent_ids=parent_ids.astype(np.int32), max_sequence_lengths=max_sequence_lengths, diff --git a/deepray/seq2seq/tests/decoder_test.py b/deepray/custom_ops/seq2seq/python/tests/decoder_test.py similarity index 97% rename from deepray/seq2seq/tests/decoder_test.py rename to deepray/custom_ops/seq2seq/python/tests/decoder_test.py index b5eb9d8c..7b07c73a 100644 --- a/deepray/seq2seq/tests/decoder_test.py +++ b/deepray/custom_ops/seq2seq/python/tests/decoder_test.py @@ -12,15 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Tests for dp.seq2seq.decoder.""" +"""Tests for tfa.seq2seq.decoder.""" import numpy as np import pytest import tensorflow as tf -from deepray.seq2seq import basic_decoder -from deepray.seq2seq import decoder -from deepray.seq2seq import sampler as sampler_py +from deepray.custom_ops.seq2seq import basic_decoder +from deepray.custom_ops.seq2seq import decoder +from deepray.custom_ops.seq2seq import sampler as sampler_py from deepray.utils import test_utils @@ -28,6 +28,7 @@ @pytest.mark.parametrize("maximum_iterations", [None, 1, tf.constant(1, dtype=tf.int32)]) @pytest.mark.parametrize("time_major", [True, False]) def test_dynamic_decode_rnn(time_major, maximum_iterations): + sequence_length = [3, 4, 3, 1, 0] batch_size = 5 max_time = 8 diff --git a/deepray/seq2seq/tests/loss_test.py b/deepray/custom_ops/seq2seq/python/tests/loss_test.py similarity index 99% rename from deepray/seq2seq/tests/loss_test.py rename to deepray/custom_ops/seq2seq/python/tests/loss_test.py index 6d48f578..e187292d 100644 --- a/deepray/seq2seq/tests/loss_test.py +++ b/deepray/custom_ops/seq2seq/python/tests/loss_test.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Tests for tf.deepray.seq2seq.python.loss_ops.""" +"""Tests for tf.addons.seq2seq.python.loss_ops.""" import pytest import numpy as np import tensorflow as tf -from deepray.seq2seq import loss +from deepray.custom_ops.seq2seq import loss def get_test_data(): @@ -98,6 +98,7 @@ def test_sequence_loss(average_across_timesteps, average_across_batch, zero_weig @pytest.mark.parametrize("average_across_timesteps", [True, False]) @pytest.mark.parametrize("average_across_batch", [True, False]) def test_sequence_loss_class(average_across_timesteps, average_across_batch): + ( batch_size, sequence_length, diff --git a/deepray/custom_ops/seq2seq/python/tests/run_all_test.py b/deepray/custom_ops/seq2seq/python/tests/run_all_test.py new file mode 100644 index 00000000..62a5c7ed --- /dev/null +++ b/deepray/custom_ops/seq2seq/python/tests/run_all_test.py @@ -0,0 +1,9 @@ +from pathlib import Path +import sys + +import pytest + +if __name__ == "__main__": + dirname = Path(__file__).absolute().parent + # sys.exit(pytest.main([str(dirname)])) + sys.exit(pytest.main(["-n 20", "-s", "-v", str(dirname)])) diff --git a/deepray/custom_ops/simple_hash_table/BUILD b/deepray/custom_ops/simple_hash_table/BUILD index 4d911a5b..660ba05e 100644 --- a/deepray/custom_ops/simple_hash_table/BUILD +++ b/deepray/custom_ops/simple_hash_table/BUILD @@ -12,7 +12,7 @@ custom_op_library( "simple_hash_table_op.cc", ], deps = [ - "@com_google_absl//absl/container:flat_hash_map", + "//deepray/custom_ops/utils:ok_status_util", ], ) @@ -21,8 +21,6 @@ py_library( srcs = ["simple_hash_table_op.py"], data = ["simple_hash_table_kernel.so"], srcs_version = "PY3", - deps = [ - ], ) py_library( @@ -56,5 +54,7 @@ py_test( ], deps = [ ":simple_hash_table", + "@pypi_absl_py//:pkg", + "@pypi_tensorflow//:pkg", ], ) diff --git a/deepray/custom_ops/simple_hash_table/simple_hash_table_kernel.cc b/deepray/custom_ops/simple_hash_table/simple_hash_table_kernel.cc index 94ab7103..a603fee0 100644 --- a/deepray/custom_ops/simple_hash_table/simple_hash_table_kernel.cc +++ b/deepray/custom_ops/simple_hash_table/simple_hash_table_kernel.cc @@ -17,18 +17,11 @@ limitations under the 
License. #include "absl/container/flat_hash_map.h" #include "absl/strings/str_cat.h" +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/platform/strcat.h" -/* After TensorFlow version 2.10.0, "Status::OK()" upgraded to "OkStatus()". -This code is for compatibility.*/ -#if TF_VERSION_INTEGER >= 2100 -#define TFOkStatus ::tensorflow::OkStatus() -#else -#define TFOkStatus ::tensorflow::Status::OK() -#endif - // Please use the appropriate namespace for your project namespace tensorflow { namespace custom_op_examples { @@ -100,6 +93,8 @@ class SimpleHashTableResource : public ::tensorflow::ResourceBase { Status Import(const Tensor& keys, const Tensor& values) { const auto key_values = keys.flat(); const auto value_values = values.flat(); + LOG(INFO) << "key_values = " << key_values; + LOG(INFO) << "value_values = " << value_values; mutex_lock l(mu_); table_.clear(); diff --git a/deepray/custom_ops/simple_hash_table/simple_hash_table_op.cc b/deepray/custom_ops/simple_hash_table/simple_hash_table_op.cc index cbc9022f..e96c2f46 100644 --- a/deepray/custom_ops/simple_hash_table/simple_hash_table_op.cc +++ b/deepray/custom_ops/simple_hash_table/simple_hash_table_op.cc @@ -15,17 +15,10 @@ limitations under the License. #include +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" -/* After TensorFlow version 2.10.0, "Status::OK()" upgraded to "OkStatus()". -This code is for compatibility.*/ -#if TF_VERSION_INTEGER >= 2100 -#define TFOkStatus ::tensorflow::OkStatus() -#else -#define TFOkStatus ::tensorflow::Status::OK() -#endif - // Please use the appropriate namespace for your project namespace tensorflow { namespace custom_op_examples { diff --git a/deepray/custom_ops/sleep/BUILD b/deepray/custom_ops/sleep/BUILD index ab6cf8bc..3a49e002 100644 --- a/deepray/custom_ops/sleep/BUILD +++ b/deepray/custom_ops/sleep/BUILD @@ -11,7 +11,7 @@ custom_op_library( "sleep_op.cc", ], deps = [ - "@com_google_absl//absl/container:flat_hash_map", + "//deepray/custom_ops/utils:ok_status_util", ], ) @@ -45,5 +45,6 @@ py_test( ], deps = [ ":sleep_op", + "@pypi_tensorflow//:pkg", ], ) diff --git a/deepray/custom_ops/sleep/sleep_op.cc b/deepray/custom_ops/sleep/sleep_op.cc index 0fe77599..df6b37c7 100644 --- a/deepray/custom_ops/sleep/sleep_op.cc +++ b/deepray/custom_ops/sleep/sleep_op.cc @@ -13,9 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" +using namespace tensorflow; // NOLINT(build/namespaces) + // Use a namespace when registering by prepending the // package's name to the op’s name and separate with a '>'. 
// This is the recommendation for out-of-tree ops to avoid name collisions in @@ -28,7 +31,7 @@ using ::tensorflow::shape_inference::InferenceContext; ::tensorflow::Status ScalarOutput(InferenceContext* c) { c->set_output(0, c->Scalar()); - return ::tensorflow::Status::OK(); + return TFOkStatus; } REGISTER_OP("Examples>AsyncSleep") diff --git a/deepray/custom_ops/text/BUILD b/deepray/custom_ops/text/BUILD index b9641b60..55b151e3 100644 --- a/deepray/custom_ops/text/BUILD +++ b/deepray/custom_ops/text/BUILD @@ -1,10 +1,10 @@ -load("//deepray:deepray.bzl", "custom_op_library") +load("//build_deps/pip_tf:defs.bzl", "tf_custom_op_library") licenses(["notice"]) # Apache 2.0 package(default_visibility = ["//visibility:public"]) -custom_op_library( +tf_custom_op_library( name = "_skip_gram_ops.so", srcs = [ "cc/kernels/skip_gram_kernels.cc", @@ -12,7 +12,7 @@ custom_op_library( ], ) -custom_op_library( +tf_custom_op_library( name = "_parse_time_op.so", srcs = select({ "//deepray:windows": [], diff --git a/deepray/custom_ops/training_ops/BUILD b/deepray/custom_ops/training_ops/BUILD index 8ff1851a..c830984b 100644 --- a/deepray/custom_ops/training_ops/BUILD +++ b/deepray/custom_ops/training_ops/BUILD @@ -1,3 +1,4 @@ +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") load("//deepray:deepray.bzl", "custom_op_library") licenses(["notice"]) # Apache 2.0 @@ -9,10 +10,15 @@ custom_op_library( "cc/kernels/training_ops.h", "cc/ops/training_ops.cc", ], - cuda_srcs = [ + copts = if_cuda(["-DGOOGLE_CUDA=1"]), + gpu_srcs = [ "cc/kernels/training_ops.h", "cc/kernels/training_ops_gpu.cu.cc", ], + deps = [ + "//deepray/custom_ops/utils:ok_status_util", + # "@org_tensorflow//tensorflow/core/kernels:training_op_helpers", + ], ) py_library( diff --git a/deepray/custom_ops/training_ops/cc/kernels/training_ops.cc b/deepray/custom_ops/training_ops/cc/kernels/training_ops.cc index 0cf0988e..0e455820 100644 --- a/deepray/custom_ops/training_ops/cc/kernels/training_ops.cc +++ b/deepray/custom_ops/training_ops/cc/kernels/training_ops.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include // NOLINT +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -31,7 +32,6 @@ using GPUDevice = Eigen::GpuDevice; using Index = Eigen::Index; namespace functor { - template struct SparseApplyAdam { Status operator()(const CPUDevice& d, typename TTypes::Matrix var, @@ -46,7 +46,7 @@ struct SparseApplyAdam { typename TTypes::ConstVec indices, const int64 inner_dim) { const Tindex N = static_cast(indices.dimension(0)); - if (N == 0) return Status::OK(); + if (N == 0) return TFOkStatus; const Tindex first_dim_size = static_cast(var.dimension(0)); const T beta1_power_scalar = beta1_power(); const T beta2_power_scalar = beta2_power(); @@ -120,11 +120,10 @@ struct SparseApplyAdam { d.parallelFor(N, cost, DoWork); } - return Status::OK(); + return TFOkStatus; } }; - -} // namespace functor +} // End of namespace functor template class SparseApplyAdamOp : public OpKernel { @@ -133,7 +132,7 @@ class SparseApplyAdamOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); } - void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS { + void Compute(OpKernelContext* ctx) override TF_NO_THREAD_SAFETY_ANALYSIS { const bool sparse = true; auto locks = MaybeLockVariableInputMutexesInOrder( ctx, use_exclusive_lock_, sparse, {0, 1, 2}); @@ -290,4 +289,187 @@ REGISTER_KERNELS(GPU, double, int64); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM #undef REGISTER_KERNELS +namespace functor { +template +struct ApplyAdamAsync { + void operator()(const CPUDevice& d, typename TTypes::Flat var, + typename TTypes::Flat m, typename TTypes::Flat v, + typename TTypes::Scalar beta1_power, + typename TTypes::Scalar beta2_power, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar beta1, + typename TTypes::ConstScalar beta2, + typename TTypes::ConstScalar epsilon, + typename TTypes::ConstFlat grad, bool use_nesterov) { + auto alpha = lr() * Eigen::numext::sqrt(T(1) - beta2_power()) / + (T(1) - beta1_power()); + + // beta1 == μ + // beta2 == ν + // v == n + // var == θ + m.device(d) = m * beta1() + grad * (T(1) - beta1()); + v.device(d) = v * beta2() + grad.square() * (T(1) - beta2()); + if (use_nesterov) { + var.device(d) -= ((grad * (T(1) - beta1()) + beta1() * m) * alpha) / + (v.sqrt() + epsilon()); + } else { + var.device(d) -= (m * alpha) / (v.sqrt() + epsilon()); + } + + // update beta1_power && beta2_power + beta1_power.device(d) = beta1_power * beta1(); + beta2_power.device(d) = beta2_power * beta2(); + } +}; +} // namespace functor + +template +class ApplyAdamAsyncOp : public OpKernel { + public: + explicit ApplyAdamAsyncOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_)); + } + + void Compute(OpKernelContext* ctx) override { + const bool sparse = false; + auto locks = MaybeLockVariableInputMutexesInOrder( + ctx, use_exclusive_lock_, sparse, {0, 1, 2, 3, 4}); + + Tensor var; + OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( + ctx, 0, use_exclusive_lock_, false, &var)); + Tensor m; + OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( + ctx, 1, use_exclusive_lock_, false, &m)); + Tensor v; + OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( + ctx, 2, use_exclusive_lock_, false, &v)); + Tensor beta1_power; + OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( + ctx, 3, 
use_exclusive_lock_, false, &beta1_power)); + Tensor beta2_power; + OP_REQUIRES_OK(ctx, GetInputTensorFromVariable( + ctx, 4, use_exclusive_lock_, false, &beta2_power)); + + OP_REQUIRES( + ctx, var.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", requested_input(0))); + OP_REQUIRES( + ctx, m.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", requested_input(1))); + OP_REQUIRES( + ctx, v.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", requested_input(2))); + OP_REQUIRES( + ctx, beta1_power.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", requested_input(3))); + OP_REQUIRES( + ctx, beta2_power.IsInitialized(), + errors::FailedPrecondition( + "Attempting to use uninitialized variables: ", requested_input(4))); + + const Tensor& lr = ctx->input(5); + const Tensor& beta1 = ctx->input(6); + const Tensor& beta2 = ctx->input(7); + const Tensor& epsilon = ctx->input(8); + + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()), + errors::InvalidArgument("lr is not a scalar : ", + lr.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1.shape()), + errors::InvalidArgument("beta1 is not a scalar: ", + beta1.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2.shape()), + errors::InvalidArgument("beta2 is not a scalar: ", + beta2.shape().DebugString())); + OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()), + errors::InvalidArgument("epsilon is not a scalar: ", + epsilon.shape().DebugString())); + + const Tensor& grad = ctx->input(9); + OP_REQUIRES(ctx, var.shape().IsSameSize(m.shape()), + errors::InvalidArgument("var and m do not have the same shape", + var.shape().DebugString(), " ", + m.shape().DebugString())); + OP_REQUIRES(ctx, var.shape().IsSameSize(v.shape()), + errors::InvalidArgument("var and v do not have the same shape", + var.shape().DebugString(), " ", + v.shape().DebugString())); + OP_REQUIRES( + ctx, var.shape().IsSameSize(grad.shape()), + errors::InvalidArgument("var and grad do not have the same shape", + var.shape().DebugString(), " ", + grad.shape().DebugString())); + + const Device& device = ctx->template eigen_device(); + functor::ApplyAdamAsync()( + device, var.flat(), m.flat(), v.flat(), + beta1_power.scalar(), beta2_power.scalar(), lr.scalar(), + beta1.scalar(), beta2.scalar(), epsilon.scalar(), + grad.flat(), use_nesterov_); + + MaybeForwardRefInputToRefOutput(ctx, 0, 0); + } + + private: + bool use_exclusive_lock_; + bool use_nesterov_; +}; + +#define REGISTER_KERNELS(D, T) \ + REGISTER_KERNEL_BUILDER( \ + Name("ApplyAdamAsync").Device(DEVICE_##D).TypeConstraint("T"), \ + ApplyAdamAsyncOp); \ + REGISTER_KERNEL_BUILDER(Name("ResourceApplyAdamAsync") \ + .Device(DEVICE_##D) \ + .TypeConstraint("T"), \ + ApplyAdamAsyncOp); +#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T); + +TF_CALL_half(REGISTER_CPU_KERNELS); +TF_CALL_bfloat16(REGISTER_CPU_KERNELS); +TF_CALL_float(REGISTER_CPU_KERNELS); +TF_CALL_double(REGISTER_CPU_KERNELS); + +#undef REGISTER_CPU_KERNELS + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +// Forward declarations of the functor specializations for GPU. 
+namespace functor { +#define DECLARE_GPU_SPEC(T) \ + template <> \ + void ApplyAdamAsync::operator()( \ + const GPUDevice& d, typename TTypes::Flat var, \ + typename TTypes::Flat m, typename TTypes::Flat v, \ + typename TTypes::Scalar beta1_power, \ + typename TTypes::Scalar beta2_power, \ + typename TTypes::ConstScalar lr, \ + typename TTypes::ConstScalar beta1, \ + typename TTypes::ConstScalar beta2, \ + typename TTypes::ConstScalar epsilon, \ + typename TTypes::ConstFlat grad, bool use_nesterov); \ + extern template struct ApplyAdamAsync; + +DECLARE_GPU_SPEC(Eigen::half) +DECLARE_GPU_SPEC(float) +DECLARE_GPU_SPEC(double) +#undef DECLARE_GPU_SPEC +} // end of namespace functor + +#define REGISTER_GPU_KERNELS(T) REGISTER_KERNELS(GPU, T); + +TF_CALL_half(REGISTER_GPU_KERNELS); +TF_CALL_float(REGISTER_GPU_KERNELS); +TF_CALL_double(REGISTER_GPU_KERNELS); + +#undef REGISTER_GPU_KERNELS +#endif // end of GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#undef REGISTER_KERNELS + } // namespace tensorflow diff --git a/deepray/custom_ops/training_ops/cc/kernels/training_ops.h b/deepray/custom_ops/training_ops/cc/kernels/training_ops.h index f657bf23..8a9d2a48 100644 --- a/deepray/custom_ops/training_ops/cc/kernels/training_ops.h +++ b/deepray/custom_ops/training_ops/cc/kernels/training_ops.h @@ -43,6 +43,19 @@ struct SparseApplyAdam { const int64 inner_dim); }; +template +struct ApplyAdamAsync { + void operator()(const Device& d, typename TTypes::Flat var, + typename TTypes::Flat m, typename TTypes::Flat v, + typename TTypes::Scalar beta1_power, + typename TTypes::Scalar beta2_power, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar beta1, + typename TTypes::ConstScalar beta2, + typename TTypes::ConstScalar epsilon, + typename TTypes::ConstFlat grad, bool use_nesterov); +}; + } // end namespace functor } // end namespace tensorflow diff --git a/deepray/custom_ops/training_ops/cc/kernels/training_ops_gpu.cu.cc b/deepray/custom_ops/training_ops/cc/kernels/training_ops_gpu.cu.cc index 7c627898..60a29a5b 100644 --- a/deepray/custom_ops/training_ops/cc/kernels/training_ops_gpu.cu.cc +++ b/deepray/custom_ops/training_ops/cc/kernels/training_ops_gpu.cu.cc @@ -17,6 +17,7 @@ limitations under the License. 
#define EIGEN_USE_GPU +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/util/gpu_kernel_helper.h" #include "training_ops.h" @@ -71,7 +72,7 @@ struct SparseApplyAdam { typename TTypes::ConstVec indices, const int64 inner_dim) { const Tindex N = static_cast(indices.dimension(0)); - if (N == 0) return Status::OK(); + if (N == 0) return TFOkStatus; const Tindex first_dim_size = var.dimension(0); const Tindex grad_size = grad.size(); @@ -87,6 +88,62 @@ struct SparseApplyAdam { } }; +template +__global__ __launch_bounds__(1024) void ApplyAdamAsyncKernel( + T* var, T* m, T* v, T* beta1_power, T* beta2_power, const T* lr_scalar, + const T* beta1_scalar, const T* beta2_scalar, const T* epsilon_scalar, + const T* grad, const bool use_nesterov, const int32 grad_size) { + T lr = *lr_scalar; + T beta1 = *beta1_scalar; + T beta2 = *beta2_scalar; + T epsilon = *epsilon_scalar; + T alpha = lr * sqrt(static_cast(1) - *beta2_power) / + (static_cast(1) - *beta1_power); + + // beta1 == μ + // beta2 == ν + // v == n + // var == θ + GPU_1D_KERNEL_LOOP(index, grad_size) { + m[index] = m[index] * beta1 + grad[index] * (static_cast(1) - beta1); + v[index] = v[index] * beta2 + + grad[index] * grad[index] * (static_cast(1) - beta2); + if (use_nesterov) { + var[index] -= + ((grad[index] * (static_cast(1) - beta1) + beta1 * m[index]) * + alpha) / + (sqrt(v[index]) + epsilon); + } else { + var[index] -= (m[index] * alpha) / (sqrt(v[index]) + epsilon); + } + } +} + +template +struct ApplyAdamAsync { + void operator()(const GPUDevice& d, typename TTypes::Flat var, + typename TTypes::Flat m, typename TTypes::Flat v, + typename TTypes::Scalar beta1_power, + typename TTypes::Scalar beta2_power, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar beta1, + typename TTypes::ConstScalar beta2, + typename TTypes::ConstScalar epsilon, + typename TTypes::ConstFlat grad, bool use_nesterov) { + int32 grad_size = grad.size(); + + GpuLaunchConfig config = GetGpuLaunchConfig(grad_size, d); + GpuLaunchKernel(ApplyAdamAsyncKernel, config.block_count, + config.thread_per_block, 0, d.stream(), var.data(), + m.data(), v.data(), beta1_power.data(), beta2_power.data(), + lr.data(), beta1.data(), beta2.data(), epsilon.data(), + grad.data(), use_nesterov, grad_size); + // update beta1_power && beta2_power + beta1_power.device(d) = beta1_power * beta1; + beta2_power.device(d) = beta2_power * beta2; + } +}; + } // namespace functor #define EXPLICITLY_INSTANTIATE_FUNCTOR(T) \ @@ -97,6 +154,11 @@ EXPLICITLY_INSTANTIATE_FUNCTOR(float); EXPLICITLY_INSTANTIATE_FUNCTOR(double); #undef EXPLICITLY_INSTANTIATE_FUNCTOR +#define REGISTER_ALL_TYPE(type) \ + template struct functor::ApplyAdamAsync; +TF_CALL_GPU_NUMBER_TYPES(REGISTER_ALL_TYPE); +#undef REGISTER_ALL_TYPE + } // end namespace tensorflow #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/deepray/custom_ops/training_ops/cc/ops/training_ops.cc b/deepray/custom_ops/training_ops/cc/ops/training_ops.cc index fda482d0..5ddf902e 100644 --- a/deepray/custom_ops/training_ops/cc/ops/training_ops.cc +++ b/deepray/custom_ops/training_ops/cc/ops/training_ops.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" @@ -39,7 +40,7 @@ static Status HandleGradAndIndicesInputs(InferenceContext* c, bool sparse, ShapeHandle grad = ShapeOrHandleShape(c, grad_idx); if (!sparse) { TF_RETURN_IF_ERROR(c->Merge(*s, grad, s)); - return Status::OK(); + return TFOkStatus; } // Indices is a vector where indices.dim[0].rank == grad[0].rank. ShapeHandle indices; @@ -53,7 +54,7 @@ static Status HandleGradAndIndicesInputs(InferenceContext* c, bool sparse, c->ReplaceDim(grad, 0, c->UnknownDim(), &grad_unknown_first)); TF_RETURN_IF_ERROR(c->Merge(*s, grad_unknown_first, s)); - return Status::OK(); + return TFOkStatus; } static Status ApplyAdamShapeFn(InferenceContext* c, bool sparse) { @@ -72,7 +73,7 @@ static Status ApplyAdamShapeFn(InferenceContext* c, bool sparse) { if (c->num_outputs() > 0) { c->set_output(0, s); } - return Status::OK(); + return TFOkStatus; } REGISTER_OP("SparseApplyAdam") @@ -114,12 +115,50 @@ REGISTER_OP("ResourceSparseApplyAdam") return ApplyAdamShapeFn(c, true /* sparse */); }); -REGISTER_OP("ResourceApplyAdam") +static Status ApplyAdamAsyncShapeFn(InferenceContext* c, bool sparse) { + ShapeHandle unused; + ShapeHandle s = ShapeOrHandleShape(c, 0); // var + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s)); // m + TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s)); // v + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); // beta1_power + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); // beta2_power + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); // lr + TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused)); // beta1 + TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused)); // beta2 + TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused)); // epsilon + TF_RETURN_IF_ERROR( + HandleGradAndIndicesInputs(c, sparse, 9 /* grad_idx */, &s)); + if (c->num_outputs() > 0) { + c->set_output(0, s); + } + return TFOkStatus; +} + +REGISTER_OP("ApplyAdamAsync") + .Input("var: Ref(T)") + .Input("m: Ref(T)") + .Input("v: Ref(T)") + .Input("beta1_power: Ref(T)") + .Input("beta2_power: Ref(T)") + .Input("lr: T") + .Input("beta1: T") + .Input("beta2: T") + .Input("epsilon: T") + .Input("grad: T") + .Output("out: Ref(T)") + .Attr("T: numbertype") + .Attr("use_locking: bool = false") + .Attr("use_nesterov: bool = false") + .SetShapeFn([](InferenceContext* c) { + return ApplyAdamAsyncShapeFn(c, false /* sparse */); + }); + +REGISTER_OP("ResourceApplyAdamAsync") .Input("var: resource") .Input("m: resource") .Input("v: resource") - .Input("beta1_power: T") - .Input("beta2_power: T") + .Input("beta1_power: resource") + .Input("beta2_power: resource") .Input("lr: T") .Input("beta1: T") .Input("beta2: T") @@ -129,7 +168,7 @@ REGISTER_OP("ResourceApplyAdam") .Attr("use_locking: bool = false") .Attr("use_nesterov: bool = false") .SetShapeFn([](InferenceContext* c) { - return ApplyAdamShapeFn(c, false /* sparse */); + return ApplyAdamAsyncShapeFn(c, false /* sparse */); }); } // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/unique_ops/BUILD b/deepray/custom_ops/unique_ops/BUILD index e240e9a2..9fa689f7 100644 --- a/deepray/custom_ops/unique_ops/BUILD +++ b/deepray/custom_ops/unique_ops/BUILD @@ -1,5 +1,5 @@ +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") load("//deepray:deepray.bzl", 
"custom_op_library") -load("@local_config_tf//:build_defs.bzl", "CPLUSPLUS_VERSION") licenses(["notice"]) # Apache 2.0 @@ -11,44 +11,42 @@ package( ) cc_library( - name = "random", + name = "unique_ali_util", srcs = [ - "cc/kernels/random.cc", - "cc/kernels/random.h", - ], - copts = [CPLUSPLUS_VERSION], - deps = [ - "@local_config_tf//:libtensorflow_framework", - "@local_config_tf//:tf_header_lib", + "cc/kernels/task_runner.h", + "cc/kernels/unique_ali_op_util.h", ], -) - -cc_test( - name = "random_test", - srcs = ["cc/kernels/random_test.cc"], deps = [ - ":random", - "@com_google_googletest//:gtest_main", + "//deepray/custom_ops/utils:ok_status_util", + "//deepray/custom_ops/utils:random", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/functional:any_invocable", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status:statusor", + "@sparsehash_c11//:dense_hash_map", ], ) custom_op_library( name = "_unique_ops.so", srcs = [ - "cc/kernels/task_runner.h", "cc/kernels/unique_ali_op.cc", - "cc/kernels/unique_ali_op_util.h", "cc/ops/unique_ops.cc", ], - copts = [CPLUSPLUS_VERSION], - cuda_srcs = [ + copts = [ + "-Wno-unused-variable", + "-Wno-unused-result", + ] + if_cuda(["-DGOOGLE_CUDA=1"]), + gpu_srcs = [ "cc/kernels/unique_ali_op_gpu.cu.cc", ], visibility = ["//visibility:public"], deps = [ - ":random", + ":unique_ali_util", "@com_google_absl//absl/container:flat_hash_map", - "@sparsehash_c11//:dense_hash_map", + "@com_google_absl//absl/functional:any_invocable", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status:statusor", ], ) @@ -73,5 +71,8 @@ py_test( main = "python/tests/run_all_test.py", deps = [ ":unique_ops", + "@pypi_numpy//:pkg", + "@pypi_pytest//:pkg", + "@pypi_tensorflow//:pkg", ], ) diff --git a/deepray/custom_ops/unique_ops/cc/kernels/random.cc b/deepray/custom_ops/unique_ops/cc/kernels/random.cc deleted file mode 100644 index 1bf84917..00000000 --- a/deepray/custom_ops/unique_ops/cc/kernels/random.cc +++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "random.h" - -#include - -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/types.h" -#include "tensorflow/core/util/env_var.h" - -namespace tensorflow { -namespace random { - -namespace { -std::mt19937_64* InitRngWithRandomSeed() { - std::random_device device("/dev/urandom"); - return new std::mt19937_64(device()); -} -std::mt19937_64 InitRngWithDefaultSeed() { return std::mt19937_64(); } - -} // anonymous namespace - -uint64 New64() { - static std::mt19937_64* rng = InitRngWithRandomSeed(); - static mutex mu(LINKER_INITIALIZED); - mutex_lock l(mu); - return (*rng)(); -} - -uint64 New64DefaultSeed() { - static std::mt19937_64 rng = InitRngWithDefaultSeed(); - static mutex mu(LINKER_INITIALIZED); - mutex_lock l(mu); - return rng(); -} - -uint64 New64Configuable() { - int64 random_64; - CHECK( - ReadInt64FromEnvVar("DEEPREC_CONFIG_RAND_64", New64(), &random_64).ok()); - return static_cast(random_64); -} - -} // namespace random -} // namespace tensorflow diff --git a/deepray/custom_ops/unique_ops/cc/kernels/task_runner.h b/deepray/custom_ops/unique_ops/cc/kernels/task_runner.h index 922f0596..566e8c4b 100644 --- a/deepray/custom_ops/unique_ops/cc/kernels/task_runner.h +++ b/deepray/custom_ops/unique_ops/cc/kernels/task_runner.h @@ -18,8 +18,8 @@ limitations under the License. #include -#include "tensorflow/core/lib/core/blocking_counter.h" #include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/blocking_counter.h" namespace tensorflow { diff --git a/deepray/custom_ops/unique_ops/cc/kernels/unique_ali_op.cc b/deepray/custom_ops/unique_ops/cc/kernels/unique_ali_op.cc index 047ff3bd..c6056334 100644 --- a/deepray/custom_ops/unique_ops/cc/kernels/unique_ali_op.cc +++ b/deepray/custom_ops/unique_ops/cc/kernels/unique_ali_op.cc @@ -17,36 +17,28 @@ limitations under the License. 
#include #include -#include "absl/container/flat_hash_map.h" -#include "sparsehash/dense_hash_map" #include "task_runner.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/util/env_var.h" #include "unique_ali_op_util.h" namespace tensorflow { namespace { -const char *kUniqueOpSerialEnv = "DEEPREC_UNIQUE_OP_SERIAL"; -const char *kUniqueOpHashMapEnv = "DEEPREC_UNIQUE_OP_HASH_MAP"; -const char *kUniqueOpUniqRatioHint = "DEEPREC_UNIQUE_OP_UNIQ_RATIO_HINT"; -const char *kUniqueOpPartitionSizeEnv = "DEEPREC_UNIQUE_OP_PARTITION_SIZE"; -const char *kMultiMapString = "MULTIMAP"; -const char *kStlHashMapString = "STL"; -const char *kAbslHashMapString = "ABSL"; -const char *kGoogleHashMapString = "GOOGLE"; +const char* kUniqueOpSerialEnv = "DEEPREC_UNIQUE_OP_SERIAL"; +const char* kUniqueOpHashMapEnv = "DEEPREC_UNIQUE_OP_HASH_MAP"; +const char* kUniqueOpUniqRatioHint = "DEEPREC_UNIQUE_OP_UNIQ_RATIO_HINT"; +const char* kUniqueOpPartitionSizeEnv = "DEEPREC_UNIQUE_OP_PARTITION_SIZE"; +const char* kMultiMapString = "MULTIMAP"; +const char* kStlHashMapString = "STL"; +const char* kAbslHashMapString = "ABSL"; +const char* kGoogleHashMapString = "GOOGLE"; const int64 kDefaultUniqueRatioHint = 4; } // namespace template class UniqueAliOp : public OpKernel { public: - explicit UniqueAliOp(OpKernelConstruction *context) : OpKernel(context) { + explicit UniqueAliOp(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK( context, ReadInt64FromEnvVar(kUniqueOpPartitionSizeEnv, kPartitionSize, &partition_size_)); @@ -101,14 +93,14 @@ class UniqueAliOp : public OpKernel { } } - void Compute(OpKernelContext *context) override { + void Compute(OpKernelContext* context) override { VLOG(2) << "Unique V2 executed"; ComputeInternal(context); } private: - void ComputeInternal(OpKernelContext *context) { - const Tensor &input = context->input(0); + void ComputeInternal(OpKernelContext* context) { + const Tensor& input = context->input(0); Tensor idx; Tensor output; Tensor output_counter; @@ -117,7 +109,7 @@ class UniqueAliOp : public OpKernel { context, input, &idx, &output, &output_counter, num_outputs(), partition_size_, serial_, unique_ratio_hint_, map_flag_); } else { - const Tensor &axis_tensor = context->input(1); + const Tensor& axis_tensor = context->input(1); UniqueWithAxis(context, input, axis_tensor, &idx, &output, &output_counter, num_outputs(), partition_size_, serial_, unique_ratio_hint_, map_flag_); @@ -129,33 +121,65 @@ class UniqueAliOp : public OpKernel { } } + protected: bool serial_ = false; int64 partition_size_ = 0; int64 unique_ratio_hint_; UniqueMaps map_flag_ = GOOGLE; // "GOOGLE" dense hash map is default }; +template +class UniqueWithCountAliOp : public UniqueAliOp { + using UniqueAliOp::serial_; + using UniqueAliOp::partition_size_; + using UniqueAliOp::unique_ratio_hint_; + using UniqueAliOp::map_flag_; + using OpKernel::num_outputs; + + public: + explicit UniqueWithCountAliOp(OpKernelConstruction* context) + : UniqueAliOp(context) { + OP_REQUIRES_OK(context, context->GetAttr("N", &num_sparse_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input = context->input(0); + Tensor idx; + Tensor output; + Tensor output_counter; + UniqueWithExtraCounts( + context, input, &idx, &output, 
&output_counter, num_outputs(), + partition_size_, serial_, unique_ratio_hint_, num_sparse_, map_flag_); + context->set_output(0, output); + context->set_output(1, idx); + context->set_output(2, output_counter); + } + + private: + int num_sparse_; +}; + #define REGISTER_UNIQUE(type) \ REGISTER_KERNEL_BUILDER(Name("Deepray>Unique") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp); \ + UniqueAliOp) \ REGISTER_KERNEL_BUILDER(Name("Deepray>Unique") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp); \ + UniqueAliOp) \ REGISTER_KERNEL_BUILDER(Name("Deepray>UniqueV2") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp); \ + UniqueAliOp) \ REGISTER_KERNEL_BUILDER(Name("Deepray>UniqueV2") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp); \ + UniqueAliOp) \ REGISTER_KERNEL_BUILDER(Name("Deepray>UniqueWithCounts") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ @@ -165,7 +189,7 @@ class UniqueAliOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp); \ + UniqueAliOp) \ REGISTER_KERNEL_BUILDER(Name("Deepray>UniqueWithCountsV2") \ .Device(DEVICE_CPU) \ .TypeConstraint("T") \ @@ -175,7 +199,17 @@ class UniqueAliOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp) + UniqueAliOp) \ + REGISTER_KERNEL_BUILDER(Name("Deepray>UniqueWithExtraCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("out_idx"), \ + UniqueWithCountAliOp) \ + REGISTER_KERNEL_BUILDER(Name("Deepray>UniqueWithExtraCounts") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("out_idx"), \ + UniqueWithCountAliOp) TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE); REGISTER_UNIQUE(tstring) #undef REGISTER_UNIQUE @@ -199,7 +233,17 @@ REGISTER_UNIQUE(tstring) .HostMemory("count") \ .TypeConstraint("T") \ .TypeConstraint("out_idx"), \ - UniqueAliOp); + UniqueAliOp) \ + REGISTER_KERNEL_BUILDER(Name("Deepray>UniqueWithExtraCounts") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .TypeConstraint("out_idx"), \ + UniqueWithCountAliOp) \ + REGISTER_KERNEL_BUILDER(Name("Deepray>UniqueWithExtraCounts") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T") \ + .TypeConstraint("out_idx"), \ + UniqueWithCountAliOp); TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE); REGISTER_UNIQUE(tstring) #undef REGISTER_UNIQUE diff --git a/deepray/custom_ops/unique_ops/cc/kernels/unique_ali_op_gpu.cu.cc b/deepray/custom_ops/unique_ops/cc/kernels/unique_ali_op_gpu.cu.cc index c3677d26..05075327 100644 --- a/deepray/custom_ops/unique_ops/cc/kernels/unique_ali_op_gpu.cu.cc +++ b/deepray/custom_ops/unique_ops/cc/kernels/unique_ali_op_gpu.cu.cc @@ -19,23 +19,15 @@ limitations under the License. 
#include "cub/device/device_radix_sort.cuh" #include "cub/device/device_scan.cuh" -#include "cub/device/device_select.cuh" -#include "cub/iterator/constant_input_iterator.cuh" #include "cub/iterator/counting_input_iterator.cuh" #include "cub/iterator/transform_input_iterator.cuh" +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/framework/tensor_types.h" -#include "tensorflow/core/framework/types.h" -#include "tensorflow/core/lib/core/threadpool.h" -#include "tensorflow/core/platform/cuda.h" -#include "tensorflow/core/platform/macros.h" #include "tensorflow/core/util/gpu_kernel_helper.h" #include "tensorflow/core/util/gpu_solvers.h" // For ScratchSpace -#include "tensorflow/stream_executor/stream_executor.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" namespace tensorflow { using GPUDevice = Eigen::GpuDevice; @@ -149,7 +141,7 @@ class UniqueAliV2GpuOp : public AsyncOpKernel { &device, this](int64 N_out) { TF_RETURN_IF_ERROR(ctx->allocate_output(0, {N_out}, &output_tensor)); TF_RETURN_IF_ERROR(ctx->allocate_output(1, {N}, &idx_tensor)); - return Status::OK(); + return TFOkStatus; }; if (N == 0) { OP_REQUIRES_OK_ASYNC(ctx, allocate_output(0), done); @@ -242,7 +234,7 @@ class UniqueAliV2GpuOp : public AsyncOpKernel { ->ThenMemcpy(N_out.mutable_data(), wrapped_num_out, sizeof(TIndex)) .ok(), errors::Internal("Failed to launch copy from device to host."), done); - ctx->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute( + ctx->device()->tensorflow_accelerator_device_info()->event_mgr->ThenExecute( stream, [ref_output_indices]() { ref_output_indices.Unref(); }); stream->BlockHostUntilDone(); int64_t uniq_size = (*N_out.data()) + 1; diff --git a/deepray/custom_ops/unique_ops/cc/kernels/unique_ali_op_util.h b/deepray/custom_ops/unique_ops/cc/kernels/unique_ali_op_util.h index c27afd2e..54287e9d 100644 --- a/deepray/custom_ops/unique_ops/cc/kernels/unique_ali_op_util.h +++ b/deepray/custom_ops/unique_ops/cc/kernels/unique_ali_op_util.h @@ -22,21 +22,14 @@ limitations under the License. 
#include #include -#include "absl/container/flat_hash_map.h" -#include "random.h" +#include "deepray/custom_ops/utils/ok_status_util.h" +#include "deepray/custom_ops/utils/random.h" #include "sparsehash/dense_hash_map" #include "task_runner.h" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/lib/core/blocking_counter.h" -#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/threadpool.h" -#include "tensorflow/core/lib/hash/hash.h" -#include "tensorflow/core/util/env_var.h" -#include "tensorflow/core/util/work_sharder.h" namespace tensorflow { @@ -192,7 +185,9 @@ void NewSizes(OpKernelContext* context, const Tensor& input, template void SerialComputeV1(OpKernelContext* context, const Tensor& input, Tensor* idx, - int64 axis, int64* uniq_size, Tensor* output) { + int64 axis, int64* uniq_size, int num_sparse, + google::dense_hash_map* counter_map, + Tensor* output) { auto Tin = input.flat(); const int64 N = input.NumElements(); auto idx_vec = idx->template vec(); @@ -207,6 +202,22 @@ void SerialComputeV1(OpKernelContext* context, const Tensor& input, Tensor* idx, } } + counter_map->set_empty_key(std::numeric_limits::max()); + counter_map->resize(2 * N); + for (int i = 0; i < num_sparse; ++i) { + const Tensor& indices_tensor = context->input(1 + i); + auto extra_ids_vec = indices_tensor.template vec(); + const Tensor& counter_tensor = context->input(1 + num_sparse + i); + auto counter_vec = counter_tensor.template vec(); + for (int64 k = 0; k < extra_ids_vec.size(); ++k) { + auto ids = extra_ids_vec(k); + auto idx_it = uniq.find(ids); + if (idx_it != uniq.end()) { + counter_map->emplace(idx_it->second, counter_vec(k)); + } + } + } + *uniq_size = static_cast(uniq.size()); TensorShape output_shape(input.shape()); output_shape.set_dim(axis, *uniq_size); @@ -224,6 +235,8 @@ void SerialComputeV1(OpKernelContext* context, const Tensor& input, Tensor* idx, template void ParallelComputeV1(OpKernelContext* context, const Tensor& input, Tensor* idx, int64 axis, int64* uniq_size, + int num_sparse, + google::dense_hash_map* counter_map, Tensor* output) { // Struct INode was used to store an inverse mapping for each node in the // hash map container. @@ -424,6 +437,25 @@ void ParallelComputeV1(OpKernelContext* context, const Tensor& input, TaskRunner t3_runner(GlobalIndexTask, thread_pool, num_tasks_t1); t3_runner.Run(); + counter_map->set_empty_key(std::numeric_limits::max()); + counter_map->resize(2 * N); + for (int i = 0; i < num_sparse; ++i) { + const Tensor& indices_tensor = context->input(1 + i); + auto extra_ids_vec = indices_tensor.template vec(); + const Tensor& counter_tensor = context->input(1 + num_sparse + i); + auto counter_vec = counter_tensor.template vec(); + for (int64 k = 0; k < extra_ids_vec.size(); ++k) { + auto ids = extra_ids_vec(k); + for (int j = 0; j < num_tasks_t1; ++j) { + const INode* inode = uniq_maps[j].GetINodeByKey(ids); + if (inode != nullptr) { + counter_map->emplace(inode->index_, counter_vec(k)); + continue; + } + } + } + } + // Parallel Step 4: Write output indicies Tensor. 
int32 max_tasks_t4 = (N + kPartitionSize - 1) / kPartitionSize; int32 num_tasks_t4 = std::max(std::min(max_threads, max_tasks_t4), 1); @@ -458,7 +490,9 @@ void ParallelComputeV1(OpKernelContext* context, const Tensor& input, template void MultiMapCompute(OpKernelContext* context, const Tensor& input, Tensor* idx, int64 axis, int64* uniq_size_out, int32 num_buckets, - int64 unique_ratio_hint, Tensor* output) { + int64 unique_ratio_hint, int num_sparse, + google::dense_hash_map* counter_map, + Tensor* output) { auto Tin = input.vec(); const int64 N = input.NumElements(); @@ -475,7 +509,7 @@ void MultiMapCompute(OpKernelContext* context, const Tensor& input, Tensor* idx, Partitioner map_parter(N, num_partitions); auto PartitionTask = [N, num_buckets, &Tin, &partitions, &map_parter, &idx_vec](int32 task_id, int32 num_tasks) { - auto st = Status::OK(); + auto st = TFOkStatus; int64* partition = partitions.get() + task_id * num_buckets; for (int64 i = 0; i < num_buckets; ++i) { partition[i] = -1; @@ -499,7 +533,7 @@ void MultiMapCompute(OpKernelContext* context, const Tensor& input, Tensor* idx, }; SummaryTaskRunner t0_runner( - PartitionTask, Status::OK(), thread_pool, num_partitions); + PartitionTask, TFOkStatus, thread_pool, num_partitions); t0_runner.Run(); OP_REQUIRES_OK(context, t0_runner.summary()); @@ -543,6 +577,24 @@ void MultiMapCompute(OpKernelContext* context, const Tensor& input, Tensor* idx, int64 uniq_size = global_offsets[num_buckets - 1] + uniq_maps[num_buckets - 1].size(); + counter_map->set_empty_key(std::numeric_limits::max()); + counter_map->resize(2 * uniq_size); + + google::dense_hash_map extra_unique_id_map; + extra_unique_id_map.set_empty_key(std::numeric_limits::max()); + extra_unique_id_map.resize(2 * uniq_size); + for (int i = 0; i < num_sparse; ++i) { + const Tensor& indices_tensor = context->input(1 + i); + auto extra_ids_vec = indices_tensor.template vec(); + const Tensor& counter_tensor = context->input(1 + num_sparse + i); + auto counter_vec = counter_tensor.template vec(); + for (int64 k = 0; k < extra_ids_vec.size(); ++k) { + auto ids = extra_ids_vec(k); + auto counts = counter_vec(k); + extra_unique_id_map.emplace(ids, counts); + } + } + *uniq_size_out = uniq_size; AllocatorAttributes attr; attr.set_on_host(true); @@ -552,7 +604,8 @@ void MultiMapCompute(OpKernelContext* context, const Tensor& input, Tensor* idx, auto key_output_vec = output->template vec(); auto OutputTask = [&key_output_vec, &uniq_maps, &global_offsets, &Tin, - &idx_vec, &map_parter](int32 task_id, int32 num_tasks) { + &idx_vec, &map_parter, &counter_map, + extra_unique_id_map](int32 task_id, int32 num_tasks) { TIndex offset = global_offsets[task_id]; for (auto iter = uniq_maps[task_id].begin(); iter != uniq_maps[task_id].end(); ++iter) { @@ -566,7 +619,10 @@ void MultiMapCompute(OpKernelContext* context, const Tensor& input, Tensor* idx, next_idx = idx_vec(cur_idx); idx_vec(cur_idx) = offset; } - + auto it = extra_unique_id_map.find(iter->first); + if (it != extra_unique_id_map.end()) { + counter_map->emplace(offset, it->second); + } ++offset; } }; @@ -631,8 +687,10 @@ void MultipleElements(OpKernelContext* context, const Tensor& input, } template -void CheckCountOutput(OpKernelContext* context, Tensor* output_counter, - Tensor* idx, int num_outputs, int64 uniq_size) { +void CheckCountOutput(OpKernelContext* context, Tensor* output, + Tensor* output_counter, Tensor* idx, int num_outputs, + int64 uniq_size, int num_sparse, + google::dense_hash_map counter_map) { if (num_outputs > 2) { auto 
idx_vec = idx->template vec(); AllocatorAttributes attr; @@ -646,13 +704,19 @@ void CheckCountOutput(OpKernelContext* context, Tensor* output_counter, for (int64 i = 0; i < N; ++i) { count_output_vec(idx_vec(i))++; } + if (num_sparse > 0) { + for (auto& it : counter_map) { + count_output_vec(it.first) += (it.second - 1); + } + } } } template -void ComputeInternalWithHashMap(OpKernelContext* context, const Tensor& input, - Tensor* idx, int64 axis, int64* uniq_size, - int64 N, bool serial, Tensor* output) { +void ComputeInternalWithHashMap( + OpKernelContext* context, const Tensor& input, Tensor* idx, int64 axis, + int64* uniq_size, int64 N, int num_sparse, bool serial, + google::dense_hash_map* counter_map, Tensor* output) { OP_REQUIRES(context, TensorShapeUtils::IsVector(input.shape()), errors::InvalidArgument("unique expects a 1D vector.")); // TODO(dga): Make unique polymorphic for returning int32 and int64 @@ -664,10 +728,10 @@ void ComputeInternalWithHashMap(OpKernelContext* context, const Tensor& input, if (N >= kPartitionLimit && !serial) { ParallelComputeV1(context, input, idx, axis, uniq_size, - output); + num_sparse, counter_map, output); } else { SerialComputeV1(context, input, idx, axis, uniq_size, - output); + num_sparse, counter_map, output); } } @@ -676,7 +740,7 @@ void UniqueInternal(OpKernelContext* context, const Tensor& input, Tensor* idx, Tensor* output, Tensor* output_counter, int num_outputs, int64 partition_size, bool serial, int64 axis, int64 unique_ratio_hint, std::vector& new_sizes, - UniqueMaps map_flag) { + UniqueMaps map_flag, int num_sparse = 0) { typedef google::dense_hash_map DefaultHashMap; AllocatorAttributes attr; @@ -686,6 +750,7 @@ void UniqueInternal(OpKernelContext* context, const Tensor& input, Tensor* idx, TensorShape({new_sizes[1]}), idx, attr)); int64 uniq_size_out; + google::dense_hash_map counter_map; if (new_sizes[0] == 1 && new_sizes[2] == 1) { // Specialized and faster implementation when unique is run over single @@ -704,35 +769,40 @@ void UniqueInternal(OpKernelContext* context, const Tensor& input, Tensor* idx, MultiMapCompute>( context, input, idx, axis, &uniq_size_out, num_buckets, - unique_ratio_hint, output); + unique_ratio_hint, num_sparse, &counter_map, output); } else { SerialComputeV1(context, input, idx, axis, - &uniq_size_out, output); + &uniq_size_out, num_sparse, + &counter_map, output); } break; case STL: ComputeInternalWithHashMap>( - context, input, idx, axis, &uniq_size_out, N, serial, output); + context, input, idx, axis, &uniq_size_out, N, num_sparse, serial, + &counter_map, output); break; case ABSL: ComputeInternalWithHashMap>( - context, input, idx, axis, &uniq_size_out, N, serial, output); + context, input, idx, axis, &uniq_size_out, N, num_sparse, serial, + &counter_map, output); break; case GOOGLE: ComputeInternalWithHashMap( - context, input, idx, axis, &uniq_size_out, N, serial, output); + context, input, idx, axis, &uniq_size_out, N, num_sparse, serial, + &counter_map, output); break; default: ComputeInternalWithHashMap( - context, input, idx, axis, &uniq_size_out, N, serial, output); + context, input, idx, axis, &uniq_size_out, N, num_sparse, serial, + &counter_map, output); } } else { MultipleElements(context, input, idx, output, &uniq_size_out, axis, new_sizes); } - CheckCountOutput(context, output_counter, idx, num_outputs, - uniq_size_out); + CheckCountOutput(context, output, output_counter, idx, num_outputs, + uniq_size_out, num_sparse, counter_map); } template @@ -763,6 +833,21 @@ void 
UniqueWithAxis(OpKernelContext* context, const Tensor& input, unique_ratio_hint, new_sizes, map_flag); } +template +void UniqueWithExtraCounts(OpKernelContext* context, const Tensor& input, + Tensor* idx, Tensor* output, Tensor* output_counter, + int num_outputs, int64 partition_size, bool serial, + int64 unique_ratio_hint, int num_sparse, + UniqueMaps map_flag) { + int64 axis = 0; + std::vector new_sizes{1, input.NumElements(), 1}; + OP_REQUIRES(context, TensorShapeUtils::IsVector(input.shape()), + errors::InvalidArgument("unique expects a 1D vector.")); + UniqueInternal(context, input, idx, output, output_counter, + num_outputs, partition_size, serial, axis, + unique_ratio_hint, new_sizes, map_flag, num_sparse); +} + } // namespace tensorflow #endif // TENSORFLOW_CORE_KERNELS_UNIQUE_ALI_OP_UTIL_H_ diff --git a/deepray/custom_ops/unique_ops/cc/ops/unique_ops.cc b/deepray/custom_ops/unique_ops/cc/ops/unique_ops.cc index f8158336..67c83d6e 100644 --- a/deepray/custom_ops/unique_ops/cc/ops/unique_ops.cc +++ b/deepray/custom_ops/unique_ops/cc/ops/unique_ops.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" @@ -46,10 +47,9 @@ REGISTER_OP("Deepray>UniqueV2") .SetShapeFn([](InferenceContext* c) { c->set_output(0, c->Vector(InferenceContext::kUnknownDim)); c->set_output(1, c->input(0)); - return Status::OK(); + return TFOkStatus; }); -// -------------------------------------------------------------------------- REGISTER_OP("Deepray>UniqueWithCounts") .Input("x: T") .Output("y: T") @@ -62,7 +62,7 @@ REGISTER_OP("Deepray>UniqueWithCounts") c->set_output(0, uniq); c->set_output(1, c->input(0)); c->set_output(2, uniq); - return Status::OK(); + return TFOkStatus; }); REGISTER_OP("Deepray>UniqueWithCountsV2") @@ -79,7 +79,25 @@ REGISTER_OP("Deepray>UniqueWithCountsV2") c->set_output(0, uniq); c->set_output(1, c->input(0)); c->set_output(2, uniq); - return Status::OK(); + return TFOkStatus; + }); + +REGISTER_OP("Deepray>UniqueWithExtraCounts") + .Input("x: T") + .Input("extra_indices: N * T") + .Input("extra_counts: N * out_idx") + .Output("y: T") + .Output("idx: out_idx") + .Output("count: out_idx") + .Attr("T: type") + .Attr("N: int >= 0") + .Attr("out_idx: {int32, int64} = DT_INT32") + .SetShapeFn([](InferenceContext* c) { + auto uniq = c->Vector(InferenceContext::kUnknownDim); + c->set_output(0, uniq); + c->set_output(1, c->input(0)); + c->set_output(2, uniq); + return TFOkStatus; }); } // namespace tensorflow \ No newline at end of file diff --git a/deepray/custom_ops/unique_ops/python/tests/unique_ali_op_test.py b/deepray/custom_ops/unique_ops/python/tests/unique_ali_op_test.py new file mode 100644 index 00000000..bdf8a334 --- /dev/null +++ b/deepray/custom_ops/unique_ops/python/tests/unique_ali_op_test.py @@ -0,0 +1,349 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for tensorflow.kernels.unique_op.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +import numpy as np + +# set environ before tf initializing global varialbes +PreservedKey = 1 << 10 +os.environ["DEEPREC_CONFIG_RAND_64"] = str(PreservedKey) + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors_impl +from tensorflow.python.framework import constant_op +from tensorflow.python.platform import test + +from deepray.custom_ops.unique_ops import gen_array_ops + + +class UniqueTest(test.TestCase): + + def testInt32(self): + x = np.random.randint(0, high=1000, size=700000) + with self.cached_session(use_gpu=True) as sess: + y, idx = gen_array_ops.deepray_unique(x) + tf_y, tf_idx = sess.run([y, idx]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + + def testInt32OutIdxInt64(self): + x = np.random.randint(2, high=1000, size=700000) + with self.cached_session(use_gpu=True) as sess: + y, idx = gen_array_ops.deepray_unique(x, out_idx=dtypes.int64) + tf_y, tf_idx = sess.run([y, idx]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + + def testInt64OutIdxInt64(self): + np.random.seed(0) + x = np.random.randint(-1000000000, high=1000000000, size=1000000, dtype=np.int64) + with self.cached_session(use_gpu=True) as sess: + y, idx = gen_array_ops.deepray_unique(x, out_idx=dtypes.int64) + tf_y, tf_idx = sess.run([y, idx]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + + def testInt64OutIdxInt32(self): + np.random.seed(0) + x = np.random.randint(-1000000000, high=1000000000, size=1000000, dtype=np.int64) + with self.cached_session(use_gpu=True) as sess: + y, idx = gen_array_ops.deepray_unique(x, out_idx=dtypes.int32) + tf_y, tf_idx = sess.run([y, idx]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + + def testString(self): + indx = np.random.randint(65, high=122, size=70000) + x = [chr(i) for i in indx] + with self.cached_session() as sess: + y, idx = gen_array_ops.deepray_unique(x) + tf_y, tf_idx = sess.run([y, idx]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]].decode('ascii')) + + def testInt32Axis(self): + for dtype in [np.int32, np.int64]: + x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]]) + with self.cached_session() as sess: + y0, idx0 = gen_array_ops.deepray_unique_v2(x, axis=np.array([0], dtype)) + tf_y0, tf_idx0 = sess.run([y0, idx0]) + y1, idx1 = gen_array_ops.deepray_unique_v2(x, axis=np.array([1], dtype)) + tf_y1, tf_idx1 = 
sess.run([y1, idx1]) + self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]])) + self.assertAllEqual(tf_idx0, np.array([0, 0, 1])) + self.assertAllEqual(tf_y1, np.array([[1, 0], [1, 0], [2, 0]])) + self.assertAllEqual(tf_idx1, np.array([0, 1, 1])) + + def testInt32V2(self): + # This test is only temporary, once V2 is used + # by default, the axis will be wrapped to allow `axis=None`. + x = np.random.randint(2, high=10, size=7000) + with self.cached_session() as sess: + y, idx = gen_array_ops.deepray_unique_v2(x, axis=np.array([], np.int32)) + tf_y, tf_idx = sess.run([y, idx]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + + def IllegalIdForMultMapUnique(self): + recover_env = False + if 'DEEPREC_UNIQUE_OP_PARTITION_SIZE' in os.environ: + recover_env = True + old_env = os.environ['DEEPREC_UNIQUE_OP_PARTITION_SIZE'] + os.environ['DEEPREC_UNIQUE_OP_PARTITION_SIZE'] = '2' + + with self.cached_session() as sess: + x = np.array([-1, 0, 1, PreservedKey], dtype=np.int64) + y, idx = gen_array_ops.deepray_unique(x, out_idx=dtypes.int64) + with self.assertRaisesRegexp( + errors_impl.InvalidArgumentError, "Input id is preserved key of dense_hash_map, " + "not supported: " + str(PreservedKey) + ): + tf_y, tf_idx = sess.run([y, idx]) + + del os.environ['DEEPREC_UNIQUE_OP_PARTITION_SIZE'] + if recover_env: + os.environ['DEEPREC_UNIQUE_OP_PARTITION_SIZE'] = old_env + + def RunUniqueWithDifferentMaps(self, map_type, test_illegal_key=False): + recover_env = False + if 'DEEPREC_UNIQUE_OP_HASH_MAP' in os.environ: + recover_env = True + old_env = os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] + + os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] = map_type + self.testInt32() + self.testInt32OutIdxInt64() + self.testInt64OutIdxInt64() + self.testInt64OutIdxInt32() + self.testInt32Axis() + self.testInt32V2() + if test_illegal_key: + self.IllegalIdForMultMapUnique() + + del os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] + if recover_env: + os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] = old_env + + def testUniqueMultiMap(self): + self.RunUniqueWithDifferentMaps('MULTIMAP', True) + + def testUniqueStlMap(self): + self.RunUniqueWithDifferentMaps('STL') + + def testUniqueAbslMap(self): + self.RunUniqueWithDifferentMaps('ABSL') + + def testUniqueDenseHashMap(self): + self.RunUniqueWithDifferentMaps('GOOGLE') + + +class UniqueWithCountsTest(test.TestCase): + + def testInt32(self): + x = np.random.randint(2, high=1000, size=700000) + with self.cached_session() as sess: + y, idx, count = gen_array_ops.deepray_unique_with_counts(x) + tf_y, tf_idx, tf_count = sess.run([y, idx, count]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + for value, count in zip(tf_y, tf_count): + self.assertEqual(count, np.sum(x == value)) + + def testInt32OutIdxInt64(self): + x = np.random.randint(2, high=1000, size=700000) + with self.cached_session() as sess: + y, idx, count = gen_array_ops.deepray_unique_with_counts(x, out_idx=dtypes.int64) + tf_y, tf_idx, tf_count = sess.run([y, idx, count]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + for value, count in zip(tf_y, tf_count): + self.assertEqual(count, np.sum(x == value)) + + def testString(self): + indx = np.random.randint(65, high=122, size=7000) + x = [chr(i) for i in 
indx] + + with self.cached_session() as sess: + y, idx, count = gen_array_ops.deepray_unique_with_counts(x) + tf_y, tf_idx, tf_count = sess.run([y, idx, count]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]].decode('ascii')) + for value, count in zip(tf_y, tf_count): + v = [1 if x[i] == value.decode('ascii') else 0 for i in range(7000)] + self.assertEqual(count, sum(v)) + + def testInt32Axis(self): + for dtype in [np.int32, np.int64]: + x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]]) + with self.cached_session() as sess: + y0, idx0, count0 = gen_array_ops.deepray_unique_with_counts_v2(x, axis=np.array([0], dtype)) + tf_y0, tf_idx0, tf_count0 = sess.run([y0, idx0, count0]) + y1, idx1, count1 = gen_array_ops.deepray_unique_with_counts_v2(x, axis=np.array([1], dtype)) + tf_y1, tf_idx1, tf_count1 = sess.run([y1, idx1, count1]) + self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]])) + self.assertAllEqual(tf_idx0, np.array([0, 0, 1])) + self.assertAllEqual(tf_count0, np.array([2, 1])) + self.assertAllEqual(tf_y1, np.array([[1, 0], [1, 0], [2, 0]])) + self.assertAllEqual(tf_idx1, np.array([0, 1, 1])) + self.assertAllEqual(tf_count1, np.array([1, 2])) + + def testInt32V2(self): + # This test is only temporary, once V2 is used + # by default, the axis will be wrapped to allow `axis=None`. + x = np.random.randint(2, high=10, size=7000) + with self.cached_session() as sess: + y, idx, count = gen_array_ops.deepray_unique_with_counts_v2(x, axis=np.array([], np.int32)) + tf_y, tf_idx, tf_count = sess.run([y, idx, count]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + for value, count in zip(tf_y, tf_count): + self.assertEqual(count, np.sum(x == value)) + + def RunUniqueWithCountsWithDifferentMaps(self, map_type): + recover_env = False + if 'DEEPREC_UNIQUE_OP_HASH_MAP' in os.environ: + recover_env = True + old_env = os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] + + os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] = map_type + self.testInt32() + self.testInt32OutIdxInt64() + self.testInt32Axis() + self.testInt32V2() + + del os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] + if recover_env: + os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] = old_env + + def testUniqueWithCountsMultiMap(self): + self.RunUniqueWithCountsWithDifferentMaps('MULTIMAP') + + def testUniqueWithCountsStlMap(self): + self.RunUniqueWithCountsWithDifferentMaps('STL') + + def testUniqueWithCountsAbslMap(self): + self.RunUniqueWithCountsWithDifferentMaps('ABSL') + + def testUniqueWithCountsDenseHashMap(self): + self.RunUniqueWithCountsWithDifferentMaps('GOOGLE') + + +class UniqueWithExtraCountsTest(test.TestCase): + + def testInt32(self): + x = np.random.randint(2, high=1000, size=700000) + extra_x = x[:5].tolist() + extra_x_tensor = [constant_op.constant(extra_x, dtypes.int64)] + extra_count = [500 for _ in range(5)] + extra_count_tensor = [constant_op.constant(extra_count, dtypes.int32)] + with self.cached_session() as sess: + y, idx, count = gen_array_ops.deepray_unique_with_extra_counts(x, extra_x_tensor, extra_count_tensor) + tf_y, tf_idx, tf_count = sess.run([y, idx, count]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + for value, count in zip(tf_y, tf_count): + if value in extra_x: + self.assertEqual(count, np.sum(x 
== value) + 499) + else: + self.assertEqual(count, np.sum(x == value)) + + def testInt32OutIdxInt64(self): + x = np.random.randint(2, high=1000, size=700000) + extra_x = x[:5].tolist() + extra_x_tensor = [constant_op.constant(extra_x, dtypes.int64)] + extra_count = [500 for _ in range(5)] + extra_count_tensor = [constant_op.constant(extra_count, dtypes.int64)] + with self.cached_session() as sess: + y, idx, count = gen_array_ops.deepray_unique_with_extra_counts(x, extra_x_tensor, extra_count_tensor) + tf_y, tf_idx, tf_count = sess.run([y, idx, count]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + for value, count in zip(tf_y, tf_count): + if value in extra_x: + self.assertEqual(count, np.sum(x == value) + 499) + else: + self.assertEqual(count, np.sum(x == value)) + + def RunUniqueWithCountsWithDifferentMaps(self, map_type): + recover_env = False + if 'DEEPREC_UNIQUE_OP_HASH_MAP' in os.environ: + recover_env = True + old_env = os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] + + os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] = map_type + self.testInt32() + self.testInt32OutIdxInt64() + + del os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] + if recover_env: + os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] = old_env + + def testUniqueWithCountsMultiMap(self): + self.RunUniqueWithCountsWithDifferentMaps('MULTIMAP') + + def testUniqueWithCountsStlMap(self): + self.RunUniqueWithCountsWithDifferentMaps('STL') + + def testUniqueWithCountsAbslMap(self): + self.RunUniqueWithCountsWithDifferentMaps('ABSL') + + def testUniqueWithCountsDenseHashMap(self): + self.RunUniqueWithCountsWithDifferentMaps('GOOGLE') + + +if __name__ == '__main__': + test.main() diff --git a/deepray/custom_ops/unique_ops/python/tests/unique_op_test.py b/deepray/custom_ops/unique_ops/python/tests/unique_op_test.py deleted file mode 100644 index a3b8a470..00000000 --- a/deepray/custom_ops/unique_ops/python/tests/unique_op_test.py +++ /dev/null @@ -1,303 +0,0 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests for tensorflow.kernels.unique_op.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os - -import numpy as np -from tensorflow.python.framework import dtypes -from tensorflow.python.platform import test -from tensorflow.python.framework import errors_impl - -from deepray.custom_ops.unique_ops import gen_array_ops - -unique = gen_array_ops.deepray_unique - -# set environ before tf initializing global varialbes -PreservedKey = 1 << 10 -os.environ["DEEPREC_CONFIG_RAND_64"] = str(PreservedKey) - - -class UniqueTest(test.TestCase): - - def testInt32(self): - x = np.random.randint(2, high=10, size=7000) - with self.cached_session() as sess: - y, idx = gen_array_ops.deepray_unique(x) - tf_y, tf_idx = self.evaluate([y, idx]) - - self.assertEqual(len(x), len(tf_idx)) - self.assertEqual(len(tf_y), len(np.unique(x))) - for i in range(len(x)): - self.assertEqual(x[i], tf_y[tf_idx[i]]) - - def testInt32OutIdxInt64(self): - x = np.random.randint(2, high=10, size=7000) - with self.cached_session() as sess: - y, idx = gen_array_ops.deepray_unique(x, out_idx=dtypes.int64) - tf_y, tf_idx = self.evaluate([y, idx]) - - self.assertEqual(len(x), len(tf_idx)) - self.assertEqual(len(tf_y), len(np.unique(x))) - for i in range(len(x)): - self.assertEqual(x[i], tf_y[tf_idx[i]]) - - def testInt64OutIdxInt64(self): - np.random.seed(0) - x = np.random.randint(-1000000000, high=1000000000, size=1000000, dtype=np.int64) - with self.cached_session(use_gpu=True) as sess: - y, idx = unique(x, out_idx=dtypes.int64) - tf_y, tf_idx = sess.run([y, idx]) - - self.assertEqual(len(x), len(tf_idx)) - self.assertEqual(len(tf_y), len(np.unique(x))) - for i in range(len(x)): - self.assertEqual(x[i], tf_y[tf_idx[i]]) - - def testInt64OutIdxInt32(self): - np.random.seed(0) - x = np.random.randint(-1000000000, high=1000000000, size=1000000, dtype=np.int64) - with self.cached_session(use_gpu=True) as sess: - y, idx = unique(x, out_idx=dtypes.int32) - tf_y, tf_idx = sess.run([y, idx]) - - self.assertEqual(len(x), len(tf_idx)) - self.assertEqual(len(tf_y), len(np.unique(x))) - for i in range(len(x)): - self.assertEqual(x[i], tf_y[tf_idx[i]]) - - def testString(self): - indx = np.random.randint(65, high=122, size=7000) - x = [chr(i) for i in indx] - with self.cached_session() as sess: - y, idx = gen_array_ops.deepray_unique(x) - tf_y, tf_idx = self.evaluate([y, idx]) - - self.assertEqual(len(x), len(tf_idx)) - self.assertEqual(len(tf_y), len(np.unique(x))) - for i in range(len(x)): - self.assertEqual(x[i], tf_y[tf_idx[i]].decode('ascii')) - - def testInt32Axis(self): - for dtype in [np.int32, np.int64]: - x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]]) - with self.cached_session() as sess: - y0, idx0 = gen_array_ops.deepray_unique_v2(x, axis=np.array([0], dtype)) - tf_y0, tf_idx0 = self.evaluate([y0, idx0]) - y1, idx1 = gen_array_ops.deepray_unique_v2(x, axis=np.array([1], dtype)) - tf_y1, tf_idx1 = self.evaluate([y1, idx1]) - self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]])) - self.assertAllEqual(tf_idx0, np.array([0, 0, 1])) - self.assertAllEqual(tf_y1, np.array([[1, 0], [1, 0], [2, 0]])) - self.assertAllEqual(tf_idx1, np.array([0, 1, 1])) - - def testInt32V2(self): - # This test is only temporary, once V2 is used - # by default, the axis will be wrapped to allow `axis=None`. 
- x = np.random.randint(2, high=10, size=7000) - with self.cached_session() as sess: - y, idx = gen_array_ops.deepray_unique_v2(x, axis=np.array([], np.int32)) - tf_y, tf_idx = self.evaluate([y, idx]) - - self.assertEqual(len(x), len(tf_idx)) - self.assertEqual(len(tf_y), len(np.unique(x))) - for i in range(len(x)): - self.assertEqual(x[i], tf_y[tf_idx[i]]) - - def IllegalIdForMultMapUnique(self): - recover_env = False - if 'DEEPREC_UNIQUE_OP_PARTITION_SIZE' in os.environ: - recover_env = True - old_env = os.environ['DEEPREC_UNIQUE_OP_PARTITION_SIZE'] - os.environ['DEEPREC_UNIQUE_OP_PARTITION_SIZE'] = '2' - - with self.cached_session() as sess: - x = np.array([-1, 0, 1, PreservedKey], dtype=np.int64) - y, idx = unique(x, out_idx=dtypes.int64) - with self.assertRaisesRegexp( - errors_impl.InvalidArgumentError, "Input id is preserved key of dense_hash_map, " - "not supported: " + str(PreservedKey) - ): - tf_y, tf_idx = sess.run([y, idx]) - - del os.environ['DEEPREC_UNIQUE_OP_PARTITION_SIZE'] - if recover_env: - os.environ['DEEPREC_UNIQUE_OP_PARTITION_SIZE'] = old_env - - def RunUniqueWithDifferentMaps(self, map_type, test_illegal_key=False): - recover_env = False - if 'DEEPREC_UNIQUE_OP_HASH_MAP' in os.environ: - recover_env = True - old_env = os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] - - os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] = map_type - self.testInt32() - self.testInt32OutIdxInt64() - self.testInt64OutIdxInt64() - self.testInt64OutIdxInt32() - self.testInt32Axis() - self.testInt32V2() - if test_illegal_key: - self.IllegalIdForMultMapUnique() - - del os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] - if recover_env: - os.environ['DEEPREC_UNIQUE_OP_HASH_MAP'] = old_env - - def testUniqueMultiMap(self): - self.RunUniqueWithDifferentMaps('MULTIMAP') - - def testUniqueStlMap(self): - self.RunUniqueWithDifferentMaps('STL') - - def testUniqueAbslMap(self): - self.RunUniqueWithDifferentMaps('ABSL') - - def testUniqueDenseHashMap(self): - self.RunUniqueWithDifferentMaps('GOOGLE') - - # def testBool(self): - # x = np.random.choice([True, False], size=7000) - # with self.cached_session() as sess: - # y, idx = gen_array_ops.deepray_unique(x) - # tf_y, tf_idx = self.evaluate([y, idx]) - - # self.assertEqual(len(x), len(tf_idx)) - # self.assertEqual(len(tf_y), len(np.unique(x))) - # for i in range(len(x)): - # self.assertEqual(x[i], tf_y[tf_idx[i]]) - - # def testBoolV2(self): - # x = np.random.choice([True, False], size=7000) - # with self.cached_session() as sess: - # y, idx = gen_array_ops.deepray_unique_v2(x, axis=np.array([], np.int32)) - # tf_y, tf_idx = self.evaluate([y, idx]) - - # self.assertEqual(len(x), len(tf_idx)) - # self.assertEqual(len(tf_y), len(np.unique(x))) - # for i in range(len(x)): - # self.assertEqual(x[i], tf_y[tf_idx[i]]) - - -# class UniqueWithCountsTest(test.TestCase): - -# def testInt32(self): -# x = np.random.randint(2, high=10, size=7000) -# with self.cached_session() as sess: -# y, idx, count = array_ops.unique_with_counts(x) -# tf_y, tf_idx, tf_count = self.evaluate([y, idx, count]) - -# self.assertEqual(len(x), len(tf_idx)) -# self.assertEqual(len(tf_y), len(np.unique(x))) -# for i in range(len(x)): -# self.assertEqual(x[i], tf_y[tf_idx[i]]) -# for value, count in zip(tf_y, tf_count): -# self.assertEqual(count, np.sum(x == value)) - -# def testInt32OutIdxInt64(self): -# x = np.random.randint(2, high=10, size=7000) -# with self.cached_session() as sess: -# y, idx, count = array_ops.unique_with_counts(x, out_idx=dtypes.int64) -# tf_y, tf_idx, tf_count = self.evaluate([y, idx, 
count]) - -# self.assertEqual(len(x), len(tf_idx)) -# self.assertEqual(len(tf_y), len(np.unique(x))) -# for i in range(len(x)): -# self.assertEqual(x[i], tf_y[tf_idx[i]]) -# for value, count in zip(tf_y, tf_count): -# self.assertEqual(count, np.sum(x == value)) - -# def testString(self): -# indx = np.random.randint(65, high=122, size=7000) -# x = [chr(i) for i in indx] - -# with self.cached_session() as sess: -# y, idx, count = array_ops.unique_with_counts(x) -# tf_y, tf_idx, tf_count = self.evaluate([y, idx, count]) - -# self.assertEqual(len(x), len(tf_idx)) -# self.assertEqual(len(tf_y), len(np.unique(x))) -# for i in range(len(x)): -# self.assertEqual(x[i], tf_y[tf_idx[i]].decode('ascii')) -# for value, count in zip(tf_y, tf_count): -# v = [1 if x[i] == value.decode('ascii') else 0 for i in range(7000)] -# self.assertEqual(count, sum(v)) - -# def testInt32Axis(self): -# for dtype in [np.int32, np.int64]: -# x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]]) -# with self.cached_session() as sess: -# y0, idx0, count0 = gen_array_ops.deepray_unique_with_counts_v2( -# x, axis=np.array([0], dtype)) -# tf_y0, tf_idx0, tf_count0 = self.evaluate([y0, idx0, count0]) -# y1, idx1, count1 = gen_array_ops.deepray_unique_with_counts_v2( -# x, axis=np.array([1], dtype)) -# tf_y1, tf_idx1, tf_count1 = self.evaluate([y1, idx1, count1]) -# self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]])) -# self.assertAllEqual(tf_idx0, np.array([0, 0, 1])) -# self.assertAllEqual(tf_count0, np.array([2, 1])) -# self.assertAllEqual(tf_y1, np.array([[1, 0], [1, 0], [2, 0]])) -# self.assertAllEqual(tf_idx1, np.array([0, 1, 1])) -# self.assertAllEqual(tf_count1, np.array([1, 2])) - -# def testInt32V2(self): -# # This test is only temporary, once V2 is used -# # by default, the axis will be wrapped to allow `axis=None`. 
-# x = np.random.randint(2, high=10, size=7000) -# with self.cached_session() as sess: -# y, idx, count = gen_array_ops.deepray_unique_with_counts_v2( -# x, axis=np.array([], np.int32)) -# tf_y, tf_idx, tf_count = self.evaluate([y, idx, count]) - -# self.assertEqual(len(x), len(tf_idx)) -# self.assertEqual(len(tf_y), len(np.unique(x))) -# for i in range(len(x)): -# self.assertEqual(x[i], tf_y[tf_idx[i]]) -# for value, count in zip(tf_y, tf_count): -# self.assertEqual(count, np.sum(x == value)) - -# def testBool(self): -# x = np.random.choice([True, False], size=7000) -# with self.cached_session() as sess: -# y, idx, count = array_ops.unique_with_counts(x) -# tf_y, tf_idx, tf_count = self.evaluate([y, idx, count]) - -# self.assertEqual(len(x), len(tf_idx)) -# self.assertEqual(len(tf_y), len(np.unique(x))) -# for i in range(len(x)): -# self.assertEqual(x[i], tf_y[tf_idx[i]]) -# for value, count in zip(tf_y, tf_count): -# self.assertEqual(count, np.sum(x == value)) - -# def testBoolV2(self): -# x = np.random.choice([True, False], size=7000) -# with self.cached_session() as sess: -# y, idx, count = gen_array_ops.deepray_unique_with_counts_v2( -# x, axis=np.array([], np.int32)) -# tf_y, tf_idx, tf_count = self.evaluate([y, idx, count]) - -# self.assertEqual(len(x), len(tf_idx)) -# self.assertEqual(len(tf_y), len(np.unique(x))) -# for i in range(len(x)): -# self.assertEqual(x[i], tf_y[tf_idx[i]]) -# for value, count in zip(tf_y, tf_count): -# self.assertEqual(count, np.sum(x == value)) - -if __name__ == '__main__': - test.main() diff --git a/deepray/custom_ops/utils/BUILD b/deepray/custom_ops/utils/BUILD new file mode 100644 index 00000000..a8ee59b8 --- /dev/null +++ b/deepray/custom_ops/utils/BUILD @@ -0,0 +1,127 @@ +load("@local_config_cuda//cuda:build_defs.bzl", "cuda_library") +load("@org_tensorflow//tensorflow:tensorflow.bzl", "tf_copts") + +package(default_visibility = ["//visibility:public"]) + +cc_library( + name = "spin_rw_lock", + srcs = [ + "spin_rw_lock.h", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "spin_lock", + srcs = [ + "spin_lock.h", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "ok_status_util", + srcs = [ + "ok_status_util.h", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "random", + srcs = [ + "random.cc", + "random.h", + ], + deps = [ + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:tf_header_lib", + ], +) + +cc_library( + name = "check_util", + srcs = [ + "check.h", + ], + visibility = ["//visibility:public"], +) + +cc_library( + name = "tensor_testutil", + testonly = 1, + srcs = ["tensor_testutil.cc"], + hdrs = ["tensor_testutil.h"], + copts = tf_copts(), + visibility = ["//visibility:public"], + deps = [ + "@com_google_googletest//:gtest", + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:tf_header_lib", + ], +) + +cc_test( + name = "tensor_testutil_test", + size = "small", + srcs = ["tensor_testutil_test.cc"], + deps = [ + ":tensor_testutil", + "@com_google_googletest//:gtest_main", + ], +) + +cc_library( + name = "kernel_benchmark_testlib", + testonly = 1, + srcs = ["kernel_benchmark_testlib.cc"], + hdrs = ["kernel_benchmark_testlib.h"], + copts = tf_copts(), + visibility = ["//visibility:public"], + deps = [ + "@com_google_benchmark//:benchmark", + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:tf_header_lib", + ], +) + +cc_library( + name = "fake_input", + testonly = 1, + srcs = ["fake_input.cc"], + hdrs = ["fake_input.h"], + 
copts = tf_copts(), + visibility = ["//visibility:public"], + deps = [ + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:tf_header_lib", + ], +) + +cuda_library( + name = "ops_testutil", + testonly = 1, + srcs = ["ops_testutil.cc"], + hdrs = ["ops_testutil.h"], + deps = [ + ":tensor_testutil", + "@local_config_tf//:libtensorflow_cc", + "@local_config_tf//:libtensorflow_framework", + "@local_config_tf//:tf_header_lib", + ], +) + +cc_test( + name = "ops_testutil_test", + size = "small", + srcs = ["ops_testutil_test.cc"], + linkopts = [ + "-lm", + ], + deps = [ + ":fake_input", + ":ops_testutil", + "@com_google_benchmark//:benchmark", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/deepray/custom_ops/utils/check.h b/deepray/custom_ops/utils/check.h new file mode 100644 index 00000000..066f786d --- /dev/null +++ b/deepray/custom_ops/utils/check.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef CHECK_H +#define CHECK_H + +#include +#include + +#define CUDACHECK(cmd) \ + do { \ + cudaError_t err = cmd; \ + if (err != cudaSuccess) { \ + printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \ + cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#endif // CHECK_H diff --git a/deepray/custom_ops/utils/fake_input.cc b/deepray/custom_ops/utils/fake_input.cc new file mode 100644 index 00000000..9e751a51 --- /dev/null +++ b/deepray/custom_ops/utils/fake_input.cc @@ -0,0 +1,239 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "fake_input.h" + +#include + +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/framework/op_def_util.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { +namespace { + +class FakeInputImpl { + public: + FakeInputImpl(const OpDef* op_def, int in_index, const NodeDef* node_def, + NodeDefBuilder* builder); + void SetN(int n); + void SetDataType(DataType dt); + void SetTypeList(DataTypeSlice dts); + Status AddInputToBuilder(); + + private: + static string FakeNodeName(int in_index); + Status GetN(int* n) const; + Status GetDataType(DataType* dt) const; + void NSources(int n, DataType dt) const; + void SourceList(DataTypeSlice dts) const; + + const OpDef* const op_def_; + const OpDef::ArgDef* const arg_; + const string in_node_; + const NodeDef* const node_def_; + NodeDefBuilder* const builder_; + + bool n_specified_; + int n_; + bool dt_specified_; + DataType dt_; + bool dts_specified_; + DataTypeSlice dts_; +}; + +FakeInputImpl::FakeInputImpl(const OpDef* op_def, int in_index, + const NodeDef* node_def, NodeDefBuilder* builder) + : op_def_(op_def), + arg_(&op_def->input_arg(in_index)), + in_node_(FakeNodeName(in_index)), + node_def_(node_def), + builder_(builder), + n_specified_(false), + dt_specified_(false), + dts_specified_(false) {} + +void FakeInputImpl::SetN(int n) { + n_specified_ = true; + n_ = n; +} + +void FakeInputImpl::SetDataType(DataType dt) { + dt_specified_ = true; + dt_ = dt; +} + +void FakeInputImpl::SetTypeList(DataTypeSlice dts) { + dts_specified_ = true; + dts_ = dts; +} + +Status FakeInputImpl::AddInputToBuilder() { + if (dts_specified_) { + SourceList(dts_); + + } else if (n_specified_ || !arg_->number_attr().empty()) { + int n; + TF_RETURN_IF_ERROR(GetN(&n)); + + DataType dt; + if (n > 0) { + TF_RETURN_IF_ERROR(GetDataType(&dt)); + } else { + dt = DT_FLOAT; + } + + NSources(n, dt); + } else { + if (!dt_specified_ && !arg_->type_list_attr().empty()) { + DataTypeVector dts; + Status status = GetNodeAttr(*node_def_, arg_->type_list_attr(), &dts); + if (!status.ok()) { + return errors::InvalidArgument( + "Could not infer list of types for input '", arg_->name(), + "': ", status.message()); + } + SourceList(dts); + return OkStatus(); + } + + DataType dt; + TF_RETURN_IF_ERROR(GetDataType(&dt)); + builder_->Input(in_node_, 0, dt); + } + return OkStatus(); +} + +// static +string FakeInputImpl::FakeNodeName(int in_index) { + char c = 'a' + (in_index % 26); + return string(&c, 1); +} + +Status FakeInputImpl::GetN(int* n) const { + if (n_specified_) { + *n = n_; + } else { + Status status = GetNodeAttr(*node_def_, arg_->number_attr(), n); + if (!status.ok()) { + return errors::InvalidArgument("Could not infer length of input '", + arg_->name(), "': ", status.message()); + } + } + return OkStatus(); +} + +Status FakeInputImpl::GetDataType(DataType* dt) const { + if (dt_specified_) { + *dt = dt_; + return OkStatus(); // Ignore is_ref field of arg_. 
+ } else if (arg_->type() != DT_INVALID) { + *dt = arg_->type(); + } else if (!arg_->type_attr().empty()) { + Status status = GetNodeAttr(*node_def_, arg_->type_attr(), dt); + if (!status.ok()) { + // Check if the type attr has a default + const OpDef::AttrDef* attr = FindAttr(arg_->type_attr(), *op_def_); + if (attr && attr->has_default_value()) { + *dt = attr->default_value().type(); + } else { + return errors::InvalidArgument("Could not infer type for input '", + arg_->name(), "': ", status.message()); + } + } + } else { + return errors::InvalidArgument("No type or type_attr field in arg '", + arg_->name(), "'"); + } + if (arg_->is_ref()) { + *dt = MakeRefType(*dt); + } + return OkStatus(); +} + +void FakeInputImpl::NSources(int n, DataType dt) const { + std::vector srcs; + srcs.reserve(n); + for (int i = 0; i < n; ++i) { + srcs.emplace_back(in_node_, i, dt); + } + builder_->Input(gtl::ArraySlice(srcs)); +} + +void FakeInputImpl::SourceList(DataTypeSlice dts) const { + std::vector srcs; + srcs.reserve(dts.size()); + for (size_t i = 0; i < dts.size(); ++i) { + srcs.emplace_back(in_node_, i, dts[i]); + } + builder_->Input(gtl::ArraySlice(srcs)); +} + +} // namespace + +// Public interface ------------------------------------------------------------ + +FakeInputFunctor FakeInput() { + return [](const OpDef& op_def, int in_index, const NodeDef& node_def, + NodeDefBuilder* builder) { + FakeInputImpl impl(&op_def, in_index, &node_def, builder); + return impl.AddInputToBuilder(); + }; +} + +FakeInputFunctor FakeInput(DataType dt) { + return [dt](const OpDef& op_def, int in_index, const NodeDef& node_def, + NodeDefBuilder* builder) { + FakeInputImpl impl(&op_def, in_index, &node_def, builder); + impl.SetDataType(dt); + return impl.AddInputToBuilder(); + }; +} + +FakeInputFunctor FakeInput(int n) { + return [n](const OpDef& op_def, int in_index, const NodeDef& node_def, + NodeDefBuilder* builder) { + FakeInputImpl impl(&op_def, in_index, &node_def, builder); + impl.SetN(n); + return impl.AddInputToBuilder(); + }; +} + +FakeInputFunctor FakeInput(int n, DataType dt) { + return [n, dt](const OpDef& op_def, int in_index, const NodeDef& node_def, + NodeDefBuilder* builder) { + FakeInputImpl impl(&op_def, in_index, &node_def, builder); + impl.SetN(n); + impl.SetDataType(dt); + return impl.AddInputToBuilder(); + }; +} + +FakeInputFunctor FakeInput(DataTypeSlice dts) { + // Make a copy to ensure the data will still be around when the lambda is + // called. + DataTypeVector dtv(dts.begin(), dts.end()); + return [dtv](const OpDef& op_def, int in_index, const NodeDef& node_def, + NodeDefBuilder* builder) { + FakeInputImpl impl(&op_def, in_index, &node_def, builder); + impl.SetTypeList(dtv); + return impl.AddInputToBuilder(); + }; +} + +} // namespace tensorflow diff --git a/deepray/custom_ops/utils/fake_input.h b/deepray/custom_ops/utils/fake_input.h new file mode 100644 index 00000000..c3062762 --- /dev/null +++ b/deepray/custom_ops/utils/fake_input.h @@ -0,0 +1,40 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_FAKE_INPUT_H_ +#define TENSORFLOW_CORE_FRAMEWORK_FAKE_INPUT_H_ + +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/types.h" + +namespace tensorflow { + +// These functions return values that may be passed to +// NodeDefBuilder::Input() to add an input for a test. Use them when +// you don't care about the node names/output indices providing the +// input. They also allow you to omit the input types and/or +// list length when they may be inferred. +FakeInputFunctor FakeInput(); // Infer everything +FakeInputFunctor FakeInput(DataType dt); +FakeInputFunctor FakeInput(int n); // List of length n +FakeInputFunctor FakeInput(int n, DataType dt); +FakeInputFunctor FakeInput(DataTypeSlice dts); +inline FakeInputFunctor FakeInput(std::initializer_list dts) { + return FakeInput(DataTypeSlice(dts)); +} + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_FAKE_INPUT_H_ diff --git a/deepray/custom_ops/utils/kernel_benchmark_testlib.cc b/deepray/custom_ops/utils/kernel_benchmark_testlib.cc new file mode 100644 index 00000000..cb325697 --- /dev/null +++ b/deepray/custom_ops/utils/kernel_benchmark_testlib.cc @@ -0,0 +1,210 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "kernel_benchmark_testlib.h" + +#include + +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/executor_factory.h" +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/local_device.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/op_segment.h" +#include "tensorflow/core/framework/versions.pb.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/lib/gtl/cleanup.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/byte_order.h" +#include "tensorflow/core/platform/cpu_info.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/public/session_options.h" +#include "tensorflow/core/public/version.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace tensorflow { +namespace test { + +// TODO(hongm): Convert `g` and `init` to using std::unique_ptr. 
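+//
+// Usage sketch (illustrative only; the graph-building code is left to the
+// caller): a typical micro-benchmark built on this helper constructs a Graph,
+// hands ownership to Benchmark, and lets Run() drive the timed loop:
+//
+//   static void BM_MyOp(benchmark::State& state) {
+//     Graph* g = new Graph(OpRegistry::Global());
+//     // ... add the op under test to `g` ...
+//     test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
+//   }
+//   BENCHMARK(BM_MyOp);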
+Benchmark::Benchmark(const string& device, Graph* g, + const SessionOptions* options, Graph* init, + Rendezvous* rendez, const char* executor_type, + bool old_benchmark_api) { + auto cleanup = gtl::MakeCleanup([g, init]() { + delete g; + delete init; + }); + + SessionOptions default_options; + if (!options) { + options = &default_options; + } + + CHECK(!old_benchmark_api) << "Expected new API only"; + + string t = absl::AsciiStrToUpper(device); + // Allow NewDevice to allocate a new threadpool with different number of + // threads for each new benchmark. + LocalDevice::set_use_global_threadpool(false); + + device_mgr_ = std::make_unique( + DeviceFactory::NewDevice(t, *options, "/job:localhost/replica:0/task:0")); + device_ = device_mgr_->ListDevices()[0]; + CHECK(device_) << "Could not create a " << device << " device"; + + pool_ = + new thread::ThreadPool(options->env, "blocking", port::MaxParallelism()); + + auto runner = [this](std::function closure) { + pool_->Schedule(closure); + }; + + if (rendez == nullptr) { + rendez_ = NewLocalRendezvous(); + } else { + rendez_ = rendez; + } + + const int graph_def_version = g->versions().producer(); + + flib_def_ = std::make_unique(g->flib_def()); + + pflr_ = std::unique_ptr( + new ProcessFunctionLibraryRuntime( + device_mgr_.get(), Env::Default(), nullptr, graph_def_version, + flib_def_.get(), OptimizerOptions(), pool_, nullptr, nullptr, + Rendezvous::Factory())); + + flr_ = pflr_->GetFLR(device_->name()); + + LocalExecutorParams params; + params.device = device_; + params.function_library = flr_; + params.create_kernel = [this, graph_def_version]( + const std::shared_ptr& props, + OpKernel** kernel) { + return CreateNonCachedKernel(device_, flr_, props, graph_def_version, + kernel); + }; + params.delete_kernel = [](OpKernel* kernel) { + DeleteNonCachedKernel(kernel); + }; + + if (init) { + std::unique_ptr init_exec; + TF_CHECK_OK(NewExecutor(executor_type, params, *init, &init_exec)); + Executor::Args args; + args.rendezvous = rendez_; + args.runner = runner; + TF_CHECK_OK(init_exec->Run(args)); + } + + TF_CHECK_OK(NewExecutor(executor_type, params, *g, &exec_)); +} + +Benchmark::Benchmark(const string& device, Graph* g, bool old_benchmark_api) + : Benchmark(device, g, nullptr, nullptr, nullptr, "", old_benchmark_api) {} + +Benchmark::~Benchmark() { + if (device_) { + rendez_->Unref(); + // We delete `exec_` before `device_mgr_` because the `exec_` destructor may + // run kernel destructors that may attempt to access state borrowed from + // `device_mgr_`, such as the resource manager. 
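+    // `pflr_` is reset next for the same reason: it was constructed with a
+    // raw pointer to `device_mgr_`.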
+ exec_.reset(); + pflr_.reset(); + device_mgr_.reset(); + delete pool_; + } +} + +void Benchmark::Run(benchmark::State& state) { + RunWithRendezvousArgs({}, {}, state); +} + +string GetRendezvousKey(const Node* node) { + string send_device; + TF_CHECK_OK(GetNodeAttr(node->attrs(), "send_device", &send_device)); + string recv_device; + TF_CHECK_OK(GetNodeAttr(node->attrs(), "recv_device", &recv_device)); + string tensor_name; + TF_CHECK_OK(GetNodeAttr(node->attrs(), "tensor_name", &tensor_name)); + uint64 send_device_incarnation; + TF_CHECK_OK( + GetNodeAttr(node->attrs(), "send_device_incarnation", + reinterpret_cast(&send_device_incarnation))); + return Rendezvous::CreateKey(send_device, send_device_incarnation, + recv_device, tensor_name, FrameAndIter(0, 0)); +} + +void Benchmark::RunWithRendezvousArgs( + const std::vector>& inputs, + const std::vector& outputs, benchmark::State& state) { + if (!device_ || state.max_iterations == 0) { + return; + } + Tensor unused; // In benchmark, we don't care the return value. + bool is_dead; + + // Warm up + Executor::Args args; + args.rendezvous = rendez_; + args.runner = [this](std::function closure) { + pool_->Schedule(closure); + }; + static const int kWarmupRuns = 3; + for (int i = 0; i < kWarmupRuns; ++i) { + for (const auto& p : inputs) { + Rendezvous::ParsedKey parsed; + TF_CHECK_OK(Rendezvous::ParseKey(p.first, &parsed)); + TF_CHECK_OK(rendez_->Send(parsed, Rendezvous::Args(), p.second, false)); + } + TF_CHECK_OK(exec_->Run(args)); + for (const string& key : outputs) { + Rendezvous::ParsedKey parsed; + TF_CHECK_OK(Rendezvous::ParseKey(key, &parsed)); + TF_CHECK_OK(rendez_->Recv(parsed, Rendezvous::Args(), &unused, &is_dead)); + } + } + TF_CHECK_OK(device_->Sync()); + VLOG(3) << kWarmupRuns << " warmup runs done."; + + // Benchmark loop. Timer starts automatically at the beginning of the loop + // and ends automatically after the last iteration. + for (auto s : state) { + for (const auto& p : inputs) { + Rendezvous::ParsedKey parsed; + TF_CHECK_OK(Rendezvous::ParseKey(p.first, &parsed)); + TF_CHECK_OK(rendez_->Send(parsed, Rendezvous::Args(), p.second, false)); + } + TF_CHECK_OK(exec_->Run(args)); + for (const string& key : outputs) { + Rendezvous::ParsedKey parsed; + TF_CHECK_OK(Rendezvous::ParseKey(key, &parsed)); + TF_CHECK_OK(rendez_->Recv(parsed, Rendezvous::Args(), &unused, &is_dead)); + } + } + TF_CHECK_OK(device_->Sync()); +} + +} // end namespace test +} // end namespace tensorflow diff --git a/deepray/custom_ops/utils/kernel_benchmark_testlib.h b/deepray/custom_ops/utils/kernel_benchmark_testlib.h new file mode 100644 index 00000000..fcab9a65 --- /dev/null +++ b/deepray/custom_ops/utils/kernel_benchmark_testlib.h @@ -0,0 +1,86 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_KERNEL_BENCHMARK_TESTLIB_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_KERNEL_BENCHMARK_TESTLIB_H_ + +#include +#include + +#include "tensorflow/core/common_runtime/executor.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/graph/testlib.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +class Device; +class FunctionLibraryRuntime; +class ProcessFunctionLibraryRuntime; +struct SessionOptions; +class DynamicDeviceMgr; + +namespace test { + +class Benchmark { + public: + // "device" must be either "cpu" or "gpu". Takes ownership of "g", + // "init", and one reference on "rendez" (if not null). + // + // old_benchmark_api: If true, the benchmark is running with older API + // * In the old API, the timer needs to be stopped/restarted + // by users. + // * In the new API, the timer starts automatically at the first + // iteration of the loop and stops after the last iteration. + // TODO(vyng) Remove this once we have migrated all code to newer API. + Benchmark(const string& device, Graph* g, + const SessionOptions* options = nullptr, Graph* init = nullptr, + Rendezvous* rendez = nullptr, const char* executor_type = "", + bool old_benchmark_api = false); + + Benchmark(const string& device, Graph* g, bool old_benchmark_api); + + ~Benchmark(); + + void Run(benchmark::State& state); + + void RunWithRendezvousArgs( + const std::vector>& inputs, + const std::vector& outputs, benchmark::State& state); + + private: + thread::ThreadPool* pool_ = nullptr; // Not owned. + Device* device_ = nullptr; // Not owned. + Rendezvous* rendez_ = nullptr; + std::unique_ptr device_mgr_; + std::unique_ptr flib_def_; + std::unique_ptr pflr_; + FunctionLibraryRuntime* flr_; // Not owned. + std::unique_ptr exec_; + + Benchmark(const Benchmark&) = delete; + void operator=(const Benchmark&) = delete; +}; + +// Returns the rendezvous key associated with the given Send/Recv node. +string GetRendezvousKey(const Node* node); + +} // end namespace test +} // end namespace tensorflow + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_KERNEL_BENCHMARK_TESTLIB_H_ diff --git a/deepray/custom_ops/utils/ok_status_util.h b/deepray/custom_ops/utils/ok_status_util.h new file mode 100644 index 00000000..a9c7517c --- /dev/null +++ b/deepray/custom_ops/utils/ok_status_util.h @@ -0,0 +1,41 @@ +/* Copyright 2024 The Deepray Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef DEEPRAY_UTILS_H_ +#define DEEPRAY_UTILS_H_ + +// #define PRINT_MACRO_HELPER(x) #x +// #define PRINT_MACRO(x) #x "=" PRINT_MACRO_HELPER(x) + +namespace tensorflow { +namespace deepray { + +/* After TensorFlow version 2.10.0, "Status::OK()" upgraded to "OkStatus()". 
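+From TensorFlow 2.15.0 onward the macro below resolves to absl::OkStatus().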
+This code is for compatibility.*/ +#if TF_VERSION_INTEGER >= 2150 +#define TFOkStatus absl::OkStatus() +// #pragma message(PRINT_MACRO(TF_VERSION_INTEGER)) +#elif TF_VERSION_INTEGER >= 2100 +#define TFOkStatus OkStatus() +// #pragma message(PRINT_MACRO(TF_VERSION_INTEGER)) +#else +// #pragma message(PRINT_MACRO(TF_VERSION_INTEGER)) +// #define TFOkStatus Status::OK() +#define TFOkStatus absl::OkStatus() +#endif +} // namespace deepray +} // namespace tensorflow + +#endif // DEEPRAY_UTILS_H_ diff --git a/deepray/custom_ops/utils/ops_testutil.cc b/deepray/custom_ops/utils/ops_testutil.cc new file mode 100644 index 00000000..f694941a --- /dev/null +++ b/deepray/custom_ops/utils/ops_testutil.cc @@ -0,0 +1,271 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/node_properties.h" +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#define EIGEN_USE_GPU +#include "tensorflow/core/common_runtime/gpu/gpu_managed_allocator.h" +#endif + +#include +#include +#include +#include +#include + +#include "ops_testutil.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/control_flow.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/framework/type_index.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/public/session_options.h" +#include "tensorflow/core/public/version.h" +#include "tensorflow/core/util/tensor_slice_reader_cache.h" + +namespace tensorflow { +namespace test { + +void SetOutputAttrs(OpKernelContext::Params* params, + std::vector* attrs) { + attrs->clear(); + for (int index = 0; index < params->op_kernel->num_outputs(); index++) { + AllocatorAttributes attr; + const bool on_host = + (params->op_kernel->output_memory_types()[index] == HOST_MEMORY); + attr.set_on_host(on_host); + attrs->push_back(attr); + } + params->output_attr_array = attrs->data(); +} + +} // namespace test + +OpsTestBase::OpsTestBase() : device_type_(DEVICE_CPU) { + auto device = DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"); + CHECK(device) << "Could 
not create CPU device"; + + thread_pool_ = std::make_unique( + Env::Default(), /*name=*/"default", /*num_threads=*/1); + + device_ = device.get(); + device_mgr_ = std::make_unique(std::move(device)); + + allocator_ = device_->GetAllocator(AllocatorAttributes()); + + flib_def_ = std::make_unique(OpRegistry::Global(), + FunctionDefLibrary{}); + pflr_ = std::make_unique( + device_mgr_.get(), Env::Default(), /*config=*/nullptr, + TF_GRAPH_DEF_VERSION, flib_def_.get(), OptimizerOptions()); +} + +OpsTestBase::~OpsTestBase() { + for (auto& temp : tensors_) { + delete temp; + } + for (auto& temp : managed_outputs_) { + delete temp; + } + tensors_.clear(); + managed_outputs_.clear(); + context_.reset(nullptr); + params_.reset(nullptr); +} + +void OpsTestBase::SetDevice(const DeviceType& device_type, + std::unique_ptr device) { + CHECK(device_) << "No device provided"; + + device_ = device.get(); + device_type_ = device_type; +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + if (device_type == DEVICE_GPU) { + managed_allocator_.reset(new GpuManagedAllocator()); + allocator_ = managed_allocator_.get(); + } else { + managed_allocator_.reset(); + allocator_ = device_->GetAllocator(AllocatorAttributes()); + } +#else + CHECK_NE(device_type, DEVICE_GPU) + << "Requesting GPU on binary compiled without GOOGLE_CUDA or " + "TENSORFLOW_USE_ROCM."; + allocator_ = device_->GetAllocator(AllocatorAttributes()); +#endif + + device_mgr_ = std::make_unique(std::move(device)); + pflr_ = std::make_unique( + device_mgr_.get(), Env::Default(), /*config=*/nullptr, + TF_GRAPH_DEF_VERSION, flib_def_.get(), OptimizerOptions(), + thread_pool_.get()); +} + +void OpsTestBase::set_node_def(const NodeDef& node_def) { + node_def_.CopyFrom(node_def); +} + +NodeDef* OpsTestBase::node_def() { return &node_def_; } + +Status OpsTestBase::InitOp() { + return InitOpWithGraphVersion(TF_GRAPH_DEF_VERSION); +} + +Status OpsTestBase::InitOpWithGraphVersion(int graph_def_version) { + std::shared_ptr props; + TF_RETURN_IF_ERROR(NodeProperties::CreateFromNodeDef( + node_def_, OpRegistry::Global(), &props)); + OpKernel* kernel; + TF_RETURN_IF_ERROR(CreateOpKernel( + device_type_, device_, allocator(), /*flib=*/nullptr, + device_->resource_manager(), props, graph_def_version, &kernel)); + kernel_.reset(kernel); + input_types_ = kernel_->input_types(); + return OkStatus(); +} + +static std::function)>* GetDefaultRunner() { + static auto* const default_runner = + new std::function)>( + [](const std::function& f) { f(); }); + return default_runner; +} + +void OpsTestBase::CreateContext() { + // Make sure the old OpKernelContext is deleted before the Params + // it was using. + context_.reset(nullptr); + + // Delete the output copies from previous runs. 
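+  // (When running on GPU, GetOutput() lazily re-creates these copies in
+  // managed memory on the next access.)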
+ for (auto& temp : managed_outputs_) { + delete temp; + } + managed_outputs_.clear(); + managed_outputs_.resize(0); + + params_.reset(new OpKernelContext::Params); + params_->device = device_; + params_->frame_iter = FrameAndIter(0, 0); + params_->inputs = inputs_; + params_->op_kernel = kernel_.get(); + step_container_.reset(new ScopedStepContainer(0, [](const string&) {})); + params_->step_container = step_container_.get(); + test::SetOutputAttrs(params_.get(), &out_alloc_attrs_); + params_->slice_reader_cache = &slice_reader_cache_wrapper_; + params_->cancellation_manager = &default_cancellation_manager_; + params_->resource_manager = device_->resource_manager(); + params_->function_library = pflr_->GetFLR(device_->name()); + params_->runner = GetDefaultRunner(); + params_->session_metadata = &session_metadata(); + + context_.reset(new OpKernelContext(params_.get())); +} + +Status OpsTestBase::RunOpKernel() { + CreateContext(); + device_->Compute(kernel_.get(), context_.get()); + return context_->status(); +} + +const Tensor& OpsTestBase::GetInput(int input_index) const { + CHECK_LT(input_index, context_->num_inputs()); + CHECK(!IsRefType(context_->input_dtype(input_index))); + return context_->input(input_index); +} + +TensorValue OpsTestBase::mutable_input(int input_index) { + CHECK_LT(input_index, inputs_.size()); + return inputs_[input_index]; +} + +Tensor* OpsTestBase::GetOutput(int output_index) { + CHECK_LT(output_index, context_->num_outputs()); + Tensor* output = context_->mutable_output(output_index); +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + if (device_type_ == DEVICE_GPU) { + managed_outputs_.resize(context_->num_outputs()); + // Copy the output tensor to managed memory if we haven't done so. + if (!managed_outputs_[output_index]) { + Tensor* managed_output = + new Tensor(allocator(), output->dtype(), output->shape()); + auto src = output->tensor_data(); + auto dst = managed_output->tensor_data(); + context_->eigen_gpu_device().memcpyDeviceToHost( + const_cast(dst.data()), src.data(), src.size()); + context_->eigen_gpu_device().synchronize(); + managed_outputs_[output_index] = managed_output; + } + output = managed_outputs_[output_index]; + } +#endif + return output; +} + +Allocator* OpsTestBase::allocator() { return allocator_; } + +OpKernel* OpsTestBase::op_kernel() { return kernel_.get(); } + +const DataTypeVector& OpsTestBase::output_types() const { + return kernel_->output_types(); +} + +Tensor* OpsTestBase::AddInput(DataType dtype, const TensorShape& shape) { + CHECK_GT(input_types_.size(), inputs_.size()) + << "Adding more inputs than types; perhaps you need to call MakeOp"; + bool is_ref = IsRefType(input_types_[inputs_.size()]); + Tensor* input = new Tensor(allocator(), dtype, shape); + tensors_.push_back(input); + if (is_ref) { + CHECK_EQ(RemoveRefType(input_types_[inputs_.size()]), dtype); + inputs_.push_back({&lock_for_refs_, input}); + } else { + CHECK_EQ(input_types_[inputs_.size()], dtype); + inputs_.push_back({nullptr, input}); + } + return input; +} + +void OpsTestBase::AddResourceInputInternal(const std::string& container_name, + const std::string& name, + const TypeIndex& type_index) { + ResourceHandle handle; + handle.set_device(device_->name()); + handle.set_container(container_name); + handle.set_name(name); + handle.set_hash_code(type_index.hash_code()); + handle.set_maybe_type_name(type_index.name()); + Tensor* input = new Tensor(allocator(), DT_RESOURCE, TensorShape({})); + input->scalar()() = handle; + tensors_.push_back(input); + 
inputs_.push_back({nullptr, input}); +} + +} // namespace tensorflow diff --git a/deepray/custom_ops/utils/ops_testutil.h b/deepray/custom_ops/utils/ops_testutil.h new file mode 100644 index 00000000..3edd4a3e --- /dev/null +++ b/deepray/custom_ops/utils/ops_testutil.h @@ -0,0 +1,212 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_OPS_TESTUTIL_H_ +#define TENSORFLOW_CORE_KERNELS_OPS_TESTUTIL_H_ + +#include +#include +#include +#include +#include +#include + +#include "tensor_testutil.h" +#include "tensorflow/core/common_runtime/device.h" +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/device_base.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/type_index.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/threadpool.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/public/session_options.h" +#include "tensorflow/core/public/version.h" +#include "tensorflow/core/util/tensor_slice_reader_cache.h" + +namespace tensorflow { +namespace test { + +void SetOutputAttrs(OpKernelContext::Params* params, + std::vector* attrs); + +} // namespace test + +// Helpful functions to test operators. +// +// This class will eventually be replaced / heavily modified +// to use the BrainClient interface. +class OpsTestBase : public ::testing::Test { + public: + OpsTestBase(); + + ~OpsTestBase() override; + + // Allow kernel unit tests to run on GPU + void SetDevice(const DeviceType& device_type, std::unique_ptr device); + + void set_node_def(const NodeDef& node_def); + + // Clients can manipulate the underlying NodeDef via this accessor. + NodeDef* node_def(); + + // Initializes an operator that takes in 'input_types' as input + // and output types as output. + // + // Returns the status of initialization. 
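+  //
+  // A typical kernel test first builds the NodeDef and then initializes the
+  // kernel (sketch; the op name and input type are illustrative):
+  //
+  //   TF_EXPECT_OK(NodeDefBuilder("my_op", "MyOp")
+  //                    .Input(FakeInput(DT_FLOAT))
+  //                    .Finalize(node_def()));
+  //   TF_EXPECT_OK(InitOp());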
+ Status InitOp(); + + // Only use this directly if you have a deprecated op that you need to test. + Status InitOpWithGraphVersion(int graph_def_version); + + // Adds an input for every element described by the shape. + // 'input_mapping' maps an index (0...NumElements(shape)) to a + // value. + // + // TODO(vrv): Replace with something like a BrainClient Feed. + template + void AddInput(const TensorShape& shape, std::function input_mapping) { + test::FillFn(AddInput(DataTypeToEnum::v(), shape), input_mapping); + } + + // Like AddInput but takes in an explicit arrayslice of data. + template + void AddInputFromArray(const TensorShape& shape, + const gtl::ArraySlice data) { + test::FillValues(AddInput(DataTypeToEnum::v(), shape), data); + } + + // Convenience function to add an input and populate it with the elements from + // an initializer list converting the types as needed. + template + void AddInputFromList(const TensorShape& shape, + std::initializer_list data) { + test::FillValues(AddInput(DataTypeToEnum::v(), shape), data); + } + + // Adds a Resource type as input. If is empty, uses the default + // container name. + template + void AddResourceInput(const string& container, const string& name, + T* resource) { + CHECK_GT(input_types_.size(), inputs_.size()) + << "Adding more inputs than types; perhaps you need to call MakeOp"; + ResourceMgr* rm = device_->resource_manager(); + std::string container_name = + container.empty() ? rm->default_container() : container; + EXPECT_TRUE(rm->Create(container_name, name, resource).ok()); + AddResourceInputInternal(container_name, name, TypeIndex::Make()); + } + + // Runs an operation producing 'num_outputs' outputs. + // + // Returns the context's status after running the operation. + Status RunOpKernel(); + + // Returns the tensor input for 'input_index'. + // + // REQUIRES: 0 <= input_index < context_->num_inputs() + const Tensor& GetInput(int input_index) const; + + TensorValue mutable_input(int input_index); + + // Returns the tensor output for 'output_index'. + // + // REQUIRES: 0 <= output_index < context_->num_outputs() + Tensor* GetOutput(int output_index); + + Allocator* allocator(); + + OpKernel* op_kernel(); + + const DataTypeVector& output_types() const; + + void set_session_metadata(SessionMetadata session_metadata) { + session_metadata_ = std::move(session_metadata); + } + + const SessionMetadata& session_metadata() const { return session_metadata_; } + + protected: + void CreateContext(); + Tensor* AddInput(DataType dtype, const TensorShape& shape); + void AddResourceInputInternal(const std::string& container_name, + const std::string& name, + const TypeIndex& type_index); + + // device_mgr_ owns device_. + std::unique_ptr device_mgr_; + Device* device_; + + // The device allocator, or the managed_allocator_ below if running on GPU. + Allocator* allocator_; + + std::unique_ptr kernel_; + std::unique_ptr step_container_; + NodeDef node_def_; + DataTypeVector input_types_; + DeviceType device_type_; + + mutex lock_for_refs_; // Used as the Mutex for inputs added as refs + + gtl::InlinedVector inputs_; + // Owns Tensors. + std::vector tensors_; + // Copies of the outputs in unified memory (host and device accessible). + std::vector managed_outputs_; + + // AllocatorAttributes for the allocators of the outputs. 
+ std::vector out_alloc_attrs_; + checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_wrapper_; + CancellationManager default_cancellation_manager_; + std::unique_ptr params_; + std::unique_ptr context_; + // Unified memory allocator, only used when running on GPU. + std::unique_ptr managed_allocator_; + + std::unique_ptr flib_def_; + std::unique_ptr pflr_; + std::unique_ptr thread_pool_; + + SessionMetadata session_metadata_; + + private: + OpsTestBase(const OpsTestBase&) = delete; + void operator=(const OpsTestBase&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_OPS_TESTUTIL_H_ diff --git a/deepray/custom_ops/utils/ops_testutil_test.cc b/deepray/custom_ops/utils/ops_testutil_test.cc new file mode 100644 index 00000000..0cee1981 --- /dev/null +++ b/deepray/custom_ops/utils/ops_testutil_test.cc @@ -0,0 +1,52 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "ops_testutil.h" + +#include "fake_input.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/kernels/variable_ops.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" + +namespace tensorflow { + +TEST_F(OpsTestBase, ScopedStepContainer) { + TF_EXPECT_OK(NodeDefBuilder("identity", "Identity") + .Input(FakeInput(DT_STRING)) + .Finalize(node_def())); + TF_EXPECT_OK(InitOp()); + AddInputFromArray(TensorShape({}), {""}); + TF_EXPECT_OK(RunOpKernel()); + EXPECT_TRUE(step_container_ != nullptr); +} + +// Verify that a Resource input can be added to the test kernel. +TEST_F(OpsTestBase, ResourceVariableInput) { + TF_EXPECT_OK(NodeDefBuilder("identity", "Identity") + .Input(FakeInput(DT_RESOURCE)) + .Finalize(node_def())); + TF_ASSERT_OK(InitOp()); + Var* var = new Var(DT_STRING); + AddResourceInput("" /* container */, "Test" /* name */, var); + TF_ASSERT_OK(RunOpKernel()); + Tensor* output = GetOutput(0); + EXPECT_EQ(output->dtype(), DT_RESOURCE); +} + +} // namespace tensorflow diff --git a/deepray/custom_ops/unique_ops/cc/kernels/random_test.cc b/deepray/custom_ops/utils/random.cc similarity index 67% rename from deepray/custom_ops/unique_ops/cc/kernels/random_test.cc rename to deepray/custom_ops/utils/random.cc index d37c47eb..6baf1f4b 100644 --- a/deepray/custom_ops/unique_ops/cc/kernels/random_test.cc +++ b/deepray/custom_ops/utils/random.cc @@ -13,25 +13,20 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/core/lib/random/random.h" +#include "random.h" -#include - -#include "tensorflow/core/platform/test.h" -#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/platform/random.h" +#include "tensorflow/core/util/env_var.h" namespace tensorflow { namespace random { -namespace { - -TEST(New64Test, SanityCheck) { - std::set values; - for (int i = 0; i < 1000000; i++) { - uint64 x = New64(); - EXPECT_TRUE(values.insert(x).second) << "duplicate " << x; - } + +uint64 New64Configuable() { + int64 random_64; + CHECK( + ReadInt64FromEnvVar("DEEPREC_CONFIG_RAND_64", New64(), &random_64).ok()); + return static_cast(random_64); } -} // namespace } // namespace random } // namespace tensorflow diff --git a/deepray/custom_ops/unique_ops/cc/kernels/random.h b/deepray/custom_ops/utils/random.h similarity index 82% rename from deepray/custom_ops/unique_ops/cc/kernels/random.h rename to deepray/custom_ops/utils/random.h index 29aae909..50b61140 100644 --- a/deepray/custom_ops/unique_ops/cc/kernels/random.h +++ b/deepray/custom_ops/utils/random.h @@ -21,14 +21,6 @@ limitations under the License. namespace tensorflow { namespace random { -// Return a 64-bit random value. Different sequences are generated -// in different processes. -uint64 New64(); - -// Return a 64-bit random value. Uses -// std::mersenne_twister_engine::default_seed as seed value. -uint64 New64DefaultSeed(); - // Call New64 to generate a 64-bit random value // if env var DEEPREC_CONFIG_RAND_64 not set. // Otherwise, return int64 from DEEPREC_CONFIG_RAND_64 diff --git a/deepray/custom_ops/utils/spin_lock.h b/deepray/custom_ops/utils/spin_lock.h new file mode 100644 index 00000000..ec99f589 --- /dev/null +++ b/deepray/custom_ops/utils/spin_lock.h @@ -0,0 +1,73 @@ +#ifndef THIRD_PARTY_TENSORFLOW_CORE_LIB_CORE_SPINLOCK_H +#define THIRD_PARTY_TENSORFLOW_CORE_LIB_CORE_SPINLOCK_H + +namespace tensorflow { +namespace { +/* Compile read-write barrier */ +#define mem_barrier() asm volatile("" : : : "memory") + +/* Pause instruction to prevent excess processor bus usage */ +#if defined(__x86_64) +#define cpu_relax() asm volatile("pause\n" : : : "memory") +#else +#define cpu_relax() asm volatile("yield\n" : : : "memory") +#endif + +#define __ASM_FORM(x) " " #x " " +#define __ASM_SEL(a, b) __ASM_FORM(a) +#define _ASM_ALIGN __ASM_SEL(.balign 4, .balign 8) +#define _ASM_PTR __ASM_SEL(.long, .quad) + +#define LOCK_PREFIX \ + ".section .smp_locks,\"a\"\n" _ASM_ALIGN "\n" _ASM_PTR \ + "661f\n" /* address */ \ + ".previous\n" \ + "661:\n\tlock; " +#define LOCK_PREFIX \ + ".section .smp_locks,\"a\"\n" _ASM_ALIGN "\n" _ASM_PTR \ + "661f\n" /* address */ \ + ".previous\n" \ + "661:\n\tlock; " + +/* Atomic exchange (of various sizes) */ +static inline unsigned long xchg_64(void* ptr, unsigned long x) { +#if defined(__x86_64) + asm volatile("xchgq %0,%1" + : "=r"((unsigned long)x) + : "m"(*(volatile long*)ptr), "0"((unsigned long)x) + : "memory"); +#else + x = __atomic_exchange_n((unsigned long*)ptr, x, __ATOMIC_SEQ_CST); +#endif + + return x; +} + +static void lock_impl(unsigned long* lock) { + while (xchg_64((void*)lock, 1)) { + while (*lock) cpu_relax(); + } +} + +static void unlock_impl(unsigned long* lock) { + mem_barrier(); + *lock = 0; +} +} // namespace + +class spin_lock { + public: + spin_lock() = default; + spin_lock(const spin_lock&) = delete; + spin_lock& operator=(const spin_lock&) = delete; + + void lock() { lock_impl(&lock_); } + + 
void unlock() { unlock_impl(&lock_); } + + private: + unsigned long lock_ = 0; +}; + +} // namespace tensorflow +#endif diff --git a/deepray/custom_ops/utils/spin_rw_lock.h b/deepray/custom_ops/utils/spin_rw_lock.h new file mode 100644 index 00000000..00439a48 --- /dev/null +++ b/deepray/custom_ops/utils/spin_rw_lock.h @@ -0,0 +1,248 @@ +#ifndef TENSORFLOW_CORE_LIB_CORE_SPIN_RW_LOCK_H_ +#define TENSORFLOW_CORE_LIB_CORE_SPIN_RW_LOCK_H_ + +#define EASY_SMP_LOCK "lock;" +#define easy_atomic_set(v, i) ((v) = (i)) + +#if defined(__x86_64) +#define cpu_relax() asm volatile("pause\n" : : : "memory") +#else +#define cpu_relax() asm volatile("yield\n" : : : "memory") +#endif + +typedef volatile int64_t easy_atomic_t; +static __inline__ void easy_atomic_add(easy_atomic_t *v, int64_t i) { +#if defined(__x86_64__) + __asm__ __volatile__(EASY_SMP_LOCK "addq %1,%0" + : "=m"((*v)) + : "r"(i), "m"((*v))); +#else + __atomic_add_fetch(v, i, __ATOMIC_SEQ_CST); +#endif +} +static __inline__ int64_t easy_atomic_add_return(easy_atomic_t *value, + int64_t i) { + int64_t __i = i; +#if defined(__x86_64__) + __asm__ __volatile__(EASY_SMP_LOCK "xaddq %0, %1;" + : "=r"(i) + : "m"(*value), "0"(i)); +#else + i = __atomic_fetch_add(value, i, __ATOMIC_SEQ_CST); +#endif + return i + __i; +} +static __inline__ int64_t easy_atomic_cmp_set(easy_atomic_t *lock, int64_t old, + int64_t set) { + uint8_t res; +#if defined(__x86_64__) + __asm__ volatile(EASY_SMP_LOCK "cmpxchgq %3, %1; sete %0" + : "=a"(res) + : "m"(*lock), "a"(old), "r"(set) + : "cc", "memory"); +#else + res = __atomic_compare_exchange_n(lock, &old, set, true, __ATOMIC_SEQ_CST, + __ATOMIC_SEQ_CST); +#endif + return res; +} +static __inline__ void easy_atomic_inc(easy_atomic_t *v) { +#if defined(__x86_64__) + __asm__ __volatile__(EASY_SMP_LOCK "incq %0" : "=m"(*v) : "m"(*v)); +#else + __atomic_add_fetch(v, 1, __ATOMIC_SEQ_CST); +#endif +} +static __inline__ void easy_atomic_dec(easy_atomic_t *v) { +#if defined(__x86_64__) + __asm__ __volatile__(EASY_SMP_LOCK "decq %0" : "=m"(*v) : "m"(*v)); +#else + __atomic_sub_fetch(v, 1, __ATOMIC_SEQ_CST); +#endif +} + +#define EASY_OK 0 +#define EASY_ERROR (-1) +#define EASY_ABORT (-2) +#define EASY_ASYNC (-3) +#define EASY_BREAK (-4) +#define EASY_ENCODE (-5) +#define EASY_QUEUE_FULL (-6) +#define EASY_AGAIN (-EAGAIN) + +typedef struct easy_spinrwlock_t { + easy_atomic_t ref_cnt; + easy_atomic_t wait_write; +} easy_spinrwlock_t; +#define EASY_SPINRWLOCK_INITIALIZER {0, 0} +static __inline__ int easy_spinrwlock_rdlock(easy_spinrwlock_t *lock) { + int ret = EASY_OK; + + if (NULL == lock) { + ret = EASY_ERROR; + } else { + int cond = 1; + + while (cond) { + int loop = 1; + + do { + easy_atomic_t oldv = lock->ref_cnt; + + if (0 <= oldv && 0 == lock->wait_write) { + if (easy_atomic_cmp_set(&lock->ref_cnt, oldv, oldv + 1)) { + return ret; + } + } + + cpu_relax(); + loop <<= 1; + } while (loop < 1024); + + sched_yield(); + } + } + + return ret; +} +static __inline__ int easy_spinrwlock_wrlock(easy_spinrwlock_t *lock) { + int ret = EASY_OK; + + if (NULL == lock) { + ret = EASY_ERROR; + } else { + int cond = 1; + easy_atomic_inc(&lock->wait_write); + + while (cond) { + int loop = 1; + + do { + easy_atomic_t oldv = lock->ref_cnt; + + if (0 == oldv) { + if (easy_atomic_cmp_set(&lock->ref_cnt, oldv, -1)) { + cond = 0; + break; + } + } + + cpu_relax(); + loop <<= 1; + } while (loop < 1024); + + if (cond) sched_yield(); + } + + easy_atomic_dec(&lock->wait_write); + } + + return ret; +} +static __inline__ int 
easy_spinrwlock_try_rdlock(easy_spinrwlock_t *lock) { + int ret = EASY_OK; + + if (NULL == lock) { + ret = EASY_ERROR; + } else { + ret = EASY_AGAIN; + easy_atomic_t oldv = lock->ref_cnt; + + if (0 <= oldv && 0 == lock->wait_write) { + easy_atomic_t newv = oldv + 1; + + if (easy_atomic_cmp_set(&lock->ref_cnt, oldv, newv)) { + ret = EASY_OK; + } + } + } + + return ret; +} +static __inline__ int easy_spinrwlock_try_wrlock(easy_spinrwlock_t *lock) { + int ret = EASY_OK; + + if (NULL == lock) { + ret = EASY_ERROR; + } else { + ret = EASY_AGAIN; + easy_atomic_t oldv = lock->ref_cnt; + + if (0 == oldv) { + easy_atomic_t newv = -1; + + if (easy_atomic_cmp_set(&lock->ref_cnt, oldv, newv)) { + ret = EASY_OK; + } + } + } + + return ret; +} +static __inline__ int easy_spinrwlock_unlock(easy_spinrwlock_t *lock) { + int ret = EASY_OK; + + if (NULL == lock) { + ret = EASY_ERROR; + } else { + while (1) { + easy_atomic_t oldv = lock->ref_cnt; + + if (-1 == oldv) { + easy_atomic_t newv = 0; + + if (easy_atomic_cmp_set(&lock->ref_cnt, oldv, newv)) { + break; + } + } else if (0 < oldv) { + easy_atomic_t newv = oldv - 1; + + if (easy_atomic_cmp_set(&lock->ref_cnt, oldv, newv)) { + break; + } + } else { + ret = EASY_ERROR; + break; + } + } + } + + return ret; +} +namespace tensorflow { + +class spin_rd_lock { + public: + typedef easy_spinrwlock_t lock_type; + + explicit spin_rd_lock(lock_type *lock) : lock_(lock) { + easy_spinrwlock_rdlock(lock_); + } + explicit spin_rd_lock(lock_type &lock) : lock_(&lock) { + easy_spinrwlock_rdlock(lock_); + } + ~spin_rd_lock() { easy_spinrwlock_unlock(lock_); } + + private: + lock_type *lock_; +}; + +class spin_wr_lock { + public: + typedef easy_spinrwlock_t lock_type; + + explicit spin_wr_lock(lock_type *lock) : lock_(lock) { + easy_spinrwlock_wrlock(lock_); + } + explicit spin_wr_lock(lock_type &lock) : lock_(&lock) { + easy_spinrwlock_wrlock(lock_); + } + ~spin_wr_lock() { easy_spinrwlock_unlock(lock_); } + + private: + lock_type *lock_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_LIB_CORE_SPIN_RW_LOCK_H_ diff --git a/deepray/custom_ops/utils/tensor_testutil.cc b/deepray/custom_ops/utils/tensor_testutil.cc new file mode 100644 index 00000000..a97daa7a --- /dev/null +++ b/deepray/custom_ops/utils/tensor_testutil.cc @@ -0,0 +1,294 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensor_testutil.h" + +#include + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace test { + +::testing::AssertionResult IsSameType(const Tensor& x, const Tensor& y) { + if (x.dtype() != y.dtype()) { + return ::testing::AssertionFailure() + << "Tensors have different dtypes (" << x.dtype() << " vs " + << y.dtype() << ")"; + } + return ::testing::AssertionSuccess(); +} + +::testing::AssertionResult IsSameShape(const Tensor& x, const Tensor& y) { + if (!x.IsSameSize(y)) { + return ::testing::AssertionFailure() + << "Tensors have different shapes (" << x.shape().DebugString() + << " vs " << y.shape().DebugString() << ")"; + } + return ::testing::AssertionSuccess(); +} + +template +static ::testing::AssertionResult EqualFailure(const T& x, const T& y) { + return ::testing::AssertionFailure() + << std::setprecision(std::numeric_limits::digits10 + 2) << x + << " not equal to " << y; +} + +template <> +::testing::AssertionResult EqualFailure(const int8& x, const int8& y) { + return EqualFailure(static_cast(x), static_cast(y)); +} + +static ::testing::AssertionResult IsEqual(float x, float y, Tolerance t) { + // We consider NaNs equal for testing. + if (Eigen::numext::isnan(x) && Eigen::numext::isnan(y)) + return ::testing::AssertionSuccess(); + if (t == Tolerance::kNone) { + if (x == y) return ::testing::AssertionSuccess(); + } else { + if (::testing::internal::CmpHelperFloatingPointEQ("", "", x, y)) + return ::testing::AssertionSuccess(); + } + return EqualFailure(x, y); +} +static ::testing::AssertionResult IsEqual(double x, double y, Tolerance t) { + // We consider NaNs equal for testing. + if (Eigen::numext::isnan(x) && Eigen::numext::isnan(y)) + return ::testing::AssertionSuccess(); + if (t == Tolerance::kNone) { + if (x == y) return ::testing::AssertionSuccess(); + } else { + if (::testing::internal::CmpHelperFloatingPointEQ("", "", x, y)) + return ::testing::AssertionSuccess(); + } + return EqualFailure(x, y); +} +static ::testing::AssertionResult IsEqual(Eigen::half x, Eigen::half y, + Tolerance t) { + // We consider NaNs equal for testing. + if (Eigen::numext::isnan(x) && Eigen::numext::isnan(y)) + return ::testing::AssertionSuccess(); + + // Below is a reimplementation of CmpHelperFloatingPointEQ, which + // we cannot use because Eigen::half is not default-constructible. + + if (Eigen::numext::isnan(x) || Eigen::numext::isnan(y)) + return EqualFailure(x, y); + + auto sign_and_magnitude_to_biased = [](uint16_t sam) { + const uint16_t kSignBitMask = 0x8000; + if (kSignBitMask & sam) return ~sam + 1; // negative number. + return kSignBitMask | sam; // positive number. + }; + + auto xb = sign_and_magnitude_to_biased(Eigen::numext::bit_cast(x)); + auto yb = sign_and_magnitude_to_biased(Eigen::numext::bit_cast(y)); + if (t == Tolerance::kNone) { + if (xb == yb) return ::testing::AssertionSuccess(); + } else { + auto distance = xb >= yb ? 
xb - yb : yb - xb; + const uint16_t kMaxUlps = 4; + if (distance <= kMaxUlps) return ::testing::AssertionSuccess(); + } + return EqualFailure(x, y); +} +template +static ::testing::AssertionResult IsEqual(const T& x, const T& y, Tolerance t) { + if (::testing::internal::CmpHelperEQ("", "", x, y)) + return ::testing::AssertionSuccess(); + return EqualFailure(x, y); +} + +template +static ::testing::AssertionResult IsEqual(const std::complex& x, + const std::complex& y, + Tolerance t) { + if (IsEqual(x.real(), y.real(), t) && IsEqual(x.imag(), y.imag(), t)) + return ::testing::AssertionSuccess(); + return EqualFailure(x, y); +} + +template +static void ExpectEqual(const Tensor& x, const Tensor& y, + Tolerance t = Tolerance::kDefault) { + const T* Tx = x.unaligned_flat().data(); + const T* Ty = y.unaligned_flat().data(); + auto size = x.NumElements(); + int max_failures = 10; + int num_failures = 0; + for (decltype(size) i = 0; i < size; ++i) { + EXPECT_TRUE(IsEqual(Tx[i], Ty[i], t)) << "i = " << (++num_failures, i); + ASSERT_LT(num_failures, max_failures) << "Too many mismatches, giving up."; + } +} + +template +static ::testing::AssertionResult IsClose(const T& x, const T& y, const T& atol, + const T& rtol) { + // We consider NaNs equal for testing. + if (Eigen::numext::isnan(x) && Eigen::numext::isnan(y)) + return ::testing::AssertionSuccess(); + if (x == y) return ::testing::AssertionSuccess(); // Handle infinity. + auto tolerance = atol + rtol * Eigen::numext::abs(x); + if (Eigen::numext::abs(x - y) <= tolerance) + return ::testing::AssertionSuccess(); + return ::testing::AssertionFailure() << x << " not close to " << y; +} + +template +static ::testing::AssertionResult IsClose(const std::complex& x, + const std::complex& y, + const T& atol, const T& rtol) { + if (IsClose(x.real(), y.real(), atol, rtol) && + IsClose(x.imag(), y.imag(), atol, rtol)) + return ::testing::AssertionSuccess(); + return ::testing::AssertionFailure() << x << " not close to " << y; +} + +// Return type can be different from T, e.g. float for T=std::complex. +template +static auto GetTolerance(double tolerance) { + using Real = typename Eigen::NumTraits::Real; + auto default_tol = static_cast(5.0) * Eigen::NumTraits::epsilon(); + auto result = tolerance < 0.0 ? 
default_tol : static_cast(tolerance); + EXPECT_GE(result, static_cast(0)); + return result; +} + +template +static void ExpectClose(const Tensor& x, const Tensor& y, double atol, + double rtol) { + auto typed_atol = GetTolerance(atol); + auto typed_rtol = GetTolerance(rtol); + + const T* Tx = x.unaligned_flat().data(); + const T* Ty = y.unaligned_flat().data(); + auto size = x.NumElements(); + int max_failures = 10; + int num_failures = 0; + for (decltype(size) i = 0; i < size; ++i) { + EXPECT_TRUE(IsClose(Tx[i], Ty[i], typed_atol, typed_rtol)) + << "i = " << (++num_failures, i) << " Tx[i] = " << Tx[i] + << " Ty[i] = " << Ty[i]; + ASSERT_LT(num_failures, max_failures) + << "Too many mismatches (atol = " << atol << " rtol = " << rtol + << "), giving up."; + } + EXPECT_EQ(num_failures, 0) + << "Mismatches detected (atol = " << atol << " rtol = " << rtol << ")."; +} + +void ExpectEqual(const Tensor& x, const Tensor& y, Tolerance t) { + ASSERT_TRUE(IsSameType(x, y)); + ASSERT_TRUE(IsSameShape(x, y)); + + switch (x.dtype()) { + case DT_FLOAT: + return ExpectEqual(x, y, t); + case DT_DOUBLE: + return ExpectEqual(x, y, t); + case DT_INT32: + return ExpectEqual(x, y); + case DT_UINT32: + return ExpectEqual(x, y); + case DT_UINT16: + return ExpectEqual(x, y); + case DT_UINT8: + return ExpectEqual(x, y); + case DT_INT16: + return ExpectEqual(x, y); + case DT_INT8: + return ExpectEqual(x, y); + case DT_STRING: + return ExpectEqual(x, y); + case DT_COMPLEX64: + return ExpectEqual(x, y, t); + case DT_COMPLEX128: + return ExpectEqual(x, y, t); + case DT_INT64: + return ExpectEqual(x, y); + case DT_UINT64: + return ExpectEqual(x, y); + case DT_BOOL: + return ExpectEqual(x, y); + case DT_QINT8: + return ExpectEqual(x, y); + case DT_QUINT8: + return ExpectEqual(x, y); + case DT_QINT16: + return ExpectEqual(x, y); + case DT_QUINT16: + return ExpectEqual(x, y); + case DT_QINT32: + return ExpectEqual(x, y); + case DT_BFLOAT16: + return ExpectEqual(x, y, t); + case DT_HALF: + return ExpectEqual(x, y, t); + case DT_FLOAT8_E5M2: + return ExpectEqual(x, y, t); + case DT_FLOAT8_E4M3FN: + return ExpectEqual(x, y, t); + case DT_INT4: + return ExpectEqual(x, y, t); + case DT_UINT4: + return ExpectEqual(x, y, t); + default: + EXPECT_TRUE(false) << "Unsupported type : " << DataTypeString(x.dtype()); + } +} + +void ExpectClose(const Tensor& x, const Tensor& y, double atol, double rtol) { + ASSERT_TRUE(IsSameType(x, y)); + ASSERT_TRUE(IsSameShape(x, y)); + + switch (x.dtype()) { + case DT_HALF: + return ExpectClose(x, y, atol, rtol); + case DT_BFLOAT16: + return ExpectClose(x, y, atol, rtol); + case DT_FLOAT: + return ExpectClose(x, y, atol, rtol); + case DT_DOUBLE: + return ExpectClose(x, y, atol, rtol); + case DT_COMPLEX64: + return ExpectClose(x, y, atol, rtol); + case DT_COMPLEX128: + return ExpectClose(x, y, atol, rtol); + default: + EXPECT_TRUE(false) << "Unsupported type : " << DataTypeString(x.dtype()); + } +} + +::testing::AssertionResult internal_test::IsClose(Eigen::half x, Eigen::half y, + double atol, double rtol) { + return test::IsClose(x, y, GetTolerance(atol), + GetTolerance(rtol)); +} +::testing::AssertionResult internal_test::IsClose(float x, float y, double atol, + double rtol) { + return test::IsClose(x, y, GetTolerance(atol), + GetTolerance(rtol)); +} +::testing::AssertionResult internal_test::IsClose(double x, double y, + double atol, double rtol) { + return test::IsClose(x, y, GetTolerance(atol), + GetTolerance(rtol)); +} + +} // end namespace test +} // end namespace tensorflow diff --git 
a/deepray/custom_ops/utils/tensor_testutil.h b/deepray/custom_ops/utils/tensor_testutil.h new file mode 100644 index 00000000..53ad5969 --- /dev/null +++ b/deepray/custom_ops/utils/tensor_testutil.h @@ -0,0 +1,162 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_FRAMEWORK_TENSOR_TESTUTIL_H_ +#define TENSORFLOW_CORE_FRAMEWORK_TENSOR_TESTUTIL_H_ + +#include + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace test { + +// Constructs a scalar tensor with 'val'. +template +Tensor AsScalar(const T& val) { + Tensor ret(DataTypeToEnum::value, {}); + ret.scalar()() = val; + return ret; +} + +// Constructs a flat tensor with 'vals'. +template +Tensor AsTensor(gtl::ArraySlice vals) { + Tensor ret(DataTypeToEnum::value, {static_cast(vals.size())}); + std::copy_n(vals.data(), vals.size(), ret.flat().data()); + return ret; +} + +// Constructs a tensor of "shape" with values "vals". +template +Tensor AsTensor(gtl::ArraySlice vals, const TensorShape& shape) { + Tensor ret; + CHECK(ret.CopyFrom(AsTensor(vals), shape)); + return ret; +} + +// Fills in '*tensor' with 'vals'. E.g., +// Tensor x(&alloc, DT_FLOAT, TensorShape({2, 2})); +// test::FillValues(&x, {11, 21, 21, 22}); +template +void FillValues(Tensor* tensor, gtl::ArraySlice vals) { + auto flat = tensor->flat(); + CHECK_EQ(flat.size(), vals.size()); + if (flat.size() > 0) { + std::copy_n(vals.data(), vals.size(), flat.data()); + } +} + +// Fills in '*tensor' with 'vals', converting the types as needed. +template +void FillValues(Tensor* tensor, std::initializer_list vals) { + auto flat = tensor->flat(); + CHECK_EQ(flat.size(), vals.size()); + if (flat.size() > 0) { + size_t i = 0; + for (auto itr = vals.begin(); itr != vals.end(); ++itr, ++i) { + flat(i) = T(*itr); + } + } +} + +// Fills in '*tensor' with a sequence of value of val, val+1, val+2, ... +// Tensor x(&alloc, DT_FLOAT, TensorShape({2, 2})); +// test::FillIota(&x, 1.0); +template +void FillIota(Tensor* tensor, const T& val) { + auto flat = tensor->flat(); + std::iota(flat.data(), flat.data() + flat.size(), val); +} + +// Fills in '*tensor' with a sequence of value of fn(0), fn(1), ... +// Tensor x(&alloc, DT_FLOAT, TensorShape({2, 2})); +// test::FillFn(&x, [](int i)->float { return i*i; }); +template +void FillFn(Tensor* tensor, std::function fn) { + auto flat = tensor->flat(); + for (int i = 0; i < flat.size(); ++i) flat(i) = fn(i); +} + +// Expects "x" and "y" are tensors of the same type, same shape, and identical +// values (within 4 ULPs for floating point types unless explicitly disabled). 
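+// (A ULP, "unit in the last place", is the gap between adjacent representable
+// values: for float, 1.0f and 1.0f + 2^-23 are exactly 1 ULP apart, so
+// Tolerance::kDefault still treats them as equal, while Tolerance::kNone
+// requires x == y.)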
+enum class Tolerance { + kNone, + kDefault, +}; +void ExpectEqual(const Tensor& x, const Tensor& y, + Tolerance t = Tolerance ::kDefault); + +// Expects "x" and "y" are tensors of the same (floating point) type, +// same shape and element-wise difference between x and y is no more +// than atol + rtol * abs(x). If atol or rtol is negative, the data type's +// epsilon * kSlackFactor is used. +void ExpectClose(const Tensor& x, const Tensor& y, double atol = -1.0, + double rtol = -1.0); + +// Expects "x" and "y" are tensors of the same type T, same shape, and +// equal values. Consider using ExpectEqual above instead. +template +void ExpectTensorEqual(const Tensor& x, const Tensor& y) { + EXPECT_EQ(x.dtype(), DataTypeToEnum::value); + ExpectEqual(x, y); +} + +::testing::AssertionResult IsSameType(const Tensor& x, const Tensor& y); +::testing::AssertionResult IsSameShape(const Tensor& x, const Tensor& y); + +template +void ExpectTensorEqual(const Tensor& x, const Tensor& y, + std::function is_equal) { + EXPECT_EQ(x.dtype(), DataTypeToEnum::value); + ASSERT_TRUE(IsSameType(x, y)); + ASSERT_TRUE(IsSameShape(x, y)); + + const T* Tx = x.unaligned_flat().data(); + const T* Ty = y.unaligned_flat().data(); + auto size = x.NumElements(); + int max_failures = 10; + int num_failures = 0; + for (decltype(size) i = 0; i < size; ++i) { + EXPECT_TRUE(is_equal(Tx[i], Ty[i])) << "i = " << (++num_failures, i); + ASSERT_LT(num_failures, max_failures) << "Too many mismatches, giving up."; + } +} + +// Expects "x" and "y" are tensors of the same type T, same shape, and +// approximate equal values. Consider using ExpectClose above instead. +template +void ExpectTensorNear(const Tensor& x, const Tensor& y, double atol) { + EXPECT_EQ(x.dtype(), DataTypeToEnum::value); + ExpectClose(x, y, atol, /*rtol=*/0.0); +} + +// For tensor_testutil_test only. +namespace internal_test { +::testing::AssertionResult IsClose(Eigen::half x, Eigen::half y, + double atol = -1.0, double rtol = -1.0); +::testing::AssertionResult IsClose(float x, float y, double atol = -1.0, + double rtol = -1.0); +::testing::AssertionResult IsClose(double x, double y, double atol = -1.0, + double rtol = -1.0); +} // namespace internal_test + +} // namespace test +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_TENSOR_TESTUTIL_H_ diff --git a/deepray/custom_ops/utils/tensor_testutil_test.cc b/deepray/custom_ops/utils/tensor_testutil_test.cc new file mode 100644 index 00000000..0e3b1572 --- /dev/null +++ b/deepray/custom_ops/utils/tensor_testutil_test.cc @@ -0,0 +1,335 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensor_testutil.h" + +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace test { +namespace { + +using internal_test::IsClose; + +template +void TestEdgeCasesNear() { + EXPECT_TRUE(IsClose(Eigen::NumTraits::infinity(), + Eigen::NumTraits::infinity(), 0.0, 0.0)); + EXPECT_TRUE(IsClose(Eigen::NumTraits::lowest(), + Eigen::NumTraits::highest(), + Eigen::NumTraits::infinity(), 0.0)); + EXPECT_FALSE( + IsClose(Eigen::NumTraits::lowest(), Eigen::NumTraits::highest(), + static_cast(Eigen::NumTraits::highest()), 0.0)); + EXPECT_FALSE(IsClose(Eigen::NumTraits::quiet_NaN(), T(0.0), 0.0, 0.0)); + EXPECT_TRUE(IsClose(Eigen::NumTraits::quiet_NaN(), + Eigen::NumTraits::quiet_NaN(), 0.0, 0.0)); + EXPECT_FALSE(IsClose(Eigen::NumTraits::quiet_NaN(), T(0.0), + Eigen::NumTraits::infinity(), 0.0)); + EXPECT_TRUE(IsClose(Eigen::NumTraits::quiet_NaN(), + Eigen::NumTraits::quiet_NaN(), + Eigen::NumTraits::infinity(), 0.0)); +} + +// For debug printing. Example usage: +// dumpFloatingPointStorage( +// static_cast(-2.71f)); +// dumpFloatingPointStorage(-2.718281f); +// dumpFloatingPointStorage (-2.71828182846); +template +void dumpFloatingPointStorage(T value) { + U* integral = reinterpret_cast(&value); + int shift_amount = (sizeof(U) << 3) - 1; + int exponent_bits = 2 + (log2(sizeof(U)) * 3); + U mask = static_cast(1) << shift_amount; + for (int bits = 0; bits <= shift_amount; ++bits) { + std::cout << ((*integral & mask) > 0); + if (bits == 0 || bits == exponent_bits) std::cout << " "; + mask >>= 1; + } + std::cout << std::endl; + printf("%.20lf\n", static_cast(value)); +} + +TEST(TensorTestUtilTest, ExpectTensorNearHalf) { + // Eigen::half has 1 sign bit, 5 exponent bits, and 10 mantissa bits. + // The exponent is offset at 15. + // https://en.wikipedia.org/wiki/Half-precision_floating-point_format + typedef Eigen::half T; + + // Trivial cases: equalities. + EXPECT_TRUE(IsClose(static_cast(1.0f), static_cast(1.0f), 0.0, 0.0)); + EXPECT_TRUE(IsClose(static_cast(0.0f), static_cast(-0.0f), 0.0, 0.0)); + EXPECT_TRUE( + IsClose(static_cast(3.141592f), static_cast(3.141592f), 0.0, 0.0)); + + // 0 10010 0001111110 -> 1150/128 = 8.984375 vs + // 0 10010 0001111111 -> 1151/128 = 8.9921875 (diff = 0.0078125) + EXPECT_TRUE( + IsClose(static_cast(8.9875f), static_cast(8.99f), 0.0078125, 0.0)); + EXPECT_FALSE( + IsClose(static_cast(8.9875f), static_cast(8.99f), 0.007, 0.0)); + + // 0 11000 0110100000 -> 1440/2 = 720 vs + // 0 11000 0110100001 -> 1441/2 = 720.5 (diff = 0.5) + EXPECT_TRUE( + IsClose(static_cast(720.2f), static_cast(720.3f), 0.5, 0.0)); + EXPECT_FALSE( + IsClose(static_cast(720.2f), static_cast(720.3f), 0.4, 0.0)); + + // 0 11001 0011010010 -> 1234 vs + // 0 11001 0011010011 -> 1235 (diff = 1) + // Rounds to even (1234.5 -> 1234). + EXPECT_TRUE( + IsClose(static_cast(1234.f), static_cast(1235.f), 1.0, 0.0)); + EXPECT_FALSE( + IsClose(static_cast(1234.5f), static_cast(1235.f), 0.5, 0.0)); + EXPECT_TRUE( + IsClose(static_cast(1234.5f), static_cast(1235.f), 1.0, 0.0)); + + // 1 10000 0101101100 -> -1388/512 = -2.7109375 vs + // 1 10000 0101110001 -> -1393/512 = -2.720703125 (diff = 0.009765625) + EXPECT_TRUE( + IsClose(static_cast(-2.71f), static_cast(-2.72f), 0.01, 0.0)); + + TestEdgeCasesNear(); +} + +TEST(TensorTestUtilTest, ExpectTensorNearFloat) { + // float has 1 sign bit, 8 exponent bits, and 23 mantissa bits. + // The exponent offset is 127. 
+ // https://en.wikipedia.org/wiki/Single-precision_floating-point_format + typedef float T; + // Trivial cases: equalities. + EXPECT_TRUE(IsClose(1.0f, 1.0f, 0.0f, 0.0f)); + EXPECT_TRUE(IsClose(0.0f, -0.0f, 0.0f, 0.0f)); + EXPECT_TRUE(IsClose(3.14159265359f, 3.14159265359f, 0.0f, 0.0f)); + + // 0 10000010 00011111100110011001101 -> 9,424,077/2^20 vs + // 0 10000010 00011111100110100110110 -> 9,424,182/2^20 + // diff = 105/2^20 = 0.000100135803223 + EXPECT_TRUE(IsClose(8.9875f, 8.9876f, 0.0001002f, 0.0f)); + EXPECT_FALSE(IsClose(8.9875f, 8.9876f, 0.0001f, 0.0f)); + + // 0 10001000 01101000000110011101001 -> 11,799,785/2^14 vs + // 0 10001000 01101000000110011101010 -> 11,799,786/2^14 + // diff = 1/2^14 = 0.00006103515625 + EXPECT_TRUE(IsClose(720.2017f, 720.2018f, 0.0001f, 0.0f)); + EXPECT_FALSE(IsClose(720.20175f, 720.20185f, 0.0001f, 0.0f)); + EXPECT_TRUE(IsClose(720.20175f, 720.20185f, 0.00013f, 0.0f)); + + // 0 10011001 11010110111100110100010 -> 15,432,098*2^3 vs + // 0 10011001 11010110111100110100011 -> 15,432,099*2^3 (diff = 2^3 = 8) + EXPECT_FALSE(IsClose(123456788.f, 123456789.f, 4.0f, 0.0f)); + EXPECT_TRUE(IsClose(123456788.f, 123456789.f, 8.0f, 0.0f)); + + // 1 10000000 01011011111100001010001 -> 11,401,297/2^22 vs + // 1 10000000 01011011111100001010101 -> 11,401,301/2^22 + // diff = 4/2^22 = 0.000000953674316 + EXPECT_TRUE(IsClose(-2.718281f, -2.718282f, 0.1f, 0.0f)); + + TestEdgeCasesNear(); +} + +TEST(TensorTestUtilTest, ExpectTensorNearDouble) { + // double has 1 sign bit, 11 exponent bits, and 52 mantissa bits. + // The exponent offset is 1,023. + // https://en.wikipedia.org/wiki/Double-precision_floating-point_format + typedef double T; + // Trivial cases: equalities. + EXPECT_TRUE(IsClose(1.0, 1.0, 0.0, 0.0)); + EXPECT_TRUE(IsClose(0.0, -0.0, 0.0, 0.0)); + EXPECT_TRUE(IsClose(3.14159265359, 3.14159265359, 0.0, 0.0)); + + // 0 10000000010 0001111110011001100110011001100110011001100110011010 + // -> 5,059,512,706,374,042/2^49 vs + // 0 10000000010 0001111110011010011010110101000010110000111100101000 + // -> 5,059,569,001,369,384/2^49 + // diff = 56,294,995,342/2^49 = 9.999999999976694198267E-5 + EXPECT_TRUE(IsClose(8.9875, 8.9876, 0.0001, 0.0)); + + // 0 10000001111 1000100101110000001100111010100100101010001100000101 + // -> 6,921,439,564,440,325/2^36 + // 0 10000001111 1000100101110000001100111010111110110111111010010001 + // -> 6,921,439,571,312,273/2^36 + // diff = 6,871,948/2^36 = 1.000000047497451305389E-4 + EXPECT_FALSE(IsClose(100720.2018, 100720.2019, 0.0001, 0.0)); + EXPECT_TRUE(IsClose(100720.2018, 100720.2019, 1.00000005e-4, 0.0)); + + // 0 10000110100 0101111011100010101000101110101101011010010111000100 + // -> 6,172,839,450,617,284 * 2 + // 0 10000110100 0101111011100010101000101110101101011010010111000011 + // -> 6,172,839,450,617,283 * 2 + // diff = 1 * 2 = 2 + EXPECT_FALSE(IsClose(12345678901234567., 12345678901234566., 1.0, 0.0)); + EXPECT_TRUE(IsClose(12345678901234567., 12345678901234566., 2.0, 0.0)); + + // 1 10000000000 0101101111110000101010001011000101000101111111001111 + // -> -6,121,026,514,870,223/2^51 + // 1 10000000000 0101101111110000101010001011000101001011011111000101 + // -> -6,121,026,514,892,741/2^51 + // diff = 22,518/2^51 = 1.00000008274037099909E-11 + EXPECT_FALSE(IsClose(-2.71828182846, -2.71828182847, 1.0e-11, 0.0)); + EXPECT_TRUE(IsClose(-2.71828182846, -2.71828182847, 1.00000009e-11, 0.0)); + + TestEdgeCasesNear(); +} + +// Tensor::Slice() and Tensor::SubSlice() may return unaligned Tensor. 
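+// ExpectEqual/ExpectClose read elements through unaligned_flat(), so they
+// accept such tensors; this test covers that path via SubSlice(3).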
+TEST(TensorTestUtilTest, ExpectTensorNearSlice) { + Tensor x(DT_FLOAT, TensorShape({7, 3})); + test::FillFn(&x, [](int i) { return 1.0f; }); + + test::ExpectTensorNear( + x.SubSlice(3), test::AsTensor({1.0, 1.0, 1.0}, TensorShape({3})), + 1e-10); +} + +template +void TestEdgeCasesClose() { + EXPECT_TRUE(IsClose(Eigen::NumTraits::infinity(), + Eigen::NumTraits::infinity(), 0.0, 0.0)); + EXPECT_TRUE(IsClose(Eigen::NumTraits::lowest(), + Eigen::NumTraits::highest(), + Eigen::NumTraits::infinity(), + Eigen::NumTraits::infinity())); + EXPECT_TRUE(IsClose(Eigen::NumTraits::lowest(), + Eigen::NumTraits::highest(), + static_cast(Eigen::NumTraits::highest()), + static_cast(Eigen::NumTraits::highest()))); + EXPECT_FALSE(IsClose(Eigen::NumTraits::quiet_NaN(), T(0.0), 0.0, 0.0)); + EXPECT_TRUE(IsClose(Eigen::NumTraits::quiet_NaN(), + Eigen::NumTraits::quiet_NaN(), 0.0, 0.0)); + EXPECT_FALSE(IsClose(Eigen::NumTraits::quiet_NaN(), T(0.0), + Eigen::NumTraits::infinity(), 0.0)); + EXPECT_TRUE(IsClose(Eigen::NumTraits::quiet_NaN(), + Eigen::NumTraits::quiet_NaN(), + Eigen::NumTraits::infinity(), 0.0)); +} + +TEST(TensorTestUtilTest, ExpectTensorCloseHalf) { + typedef Eigen::half T; + + EXPECT_TRUE(IsClose(static_cast(1.0f), static_cast(1.1f), 0.1, 0.1)); + EXPECT_TRUE(IsClose(static_cast(1.0f), static_cast(1.0f), 0.0, 0.0)); + EXPECT_FALSE(IsClose(static_cast(1.0f), static_cast(1.1f), 0.0, 0.0)); + + // Epsilon: 0 00010 0000000000 -> 2^-13 = 0.0001220703125 + // Default Tolerance: 0 00100 0100000000 -> 5/2^13 = 0.0006103515625 + + // 1.234 -> 0 01111 0011110000 -> 1264/2^10 = 1.234375 + // 1.233 -> 0 01111 0011101111 -> 1263/2^10 = 1.2333984375 + // 1.235 -> 0 01111 0011110001 -> 1265/2^10 = 1.2353515625 + // 1.232 -> 0 01111 0011101110 -> 1262/2^10 = 1.232421875 + // 1.236 -> 0 01111 0011110010 -> 1266/2^10 = 1.236328125 + // 1/2^10 = 0.0009765625E + // Threshold = 0.0013637542724609375 + EXPECT_TRUE(IsClose(static_cast(1.234f), static_cast(1.234f))); + EXPECT_TRUE(IsClose(static_cast(1.234f), static_cast(1.233f))); + EXPECT_TRUE(IsClose(static_cast(1.234f), static_cast(1.235f))); + + // Diff = 0.001953125 + EXPECT_FALSE(IsClose(static_cast(1.234f), static_cast(1.232f))); + EXPECT_FALSE(IsClose(static_cast(1.234f), static_cast(1.236f))); + EXPECT_TRUE( + IsClose(static_cast(1.234f), static_cast(1.232f), 8e-4f, 1e-3f)); + EXPECT_TRUE( + IsClose(static_cast(1.234f), static_cast(1.236f), 1.4e-3f, 5e-4f)); + + // Too fine-grained: won't detect the difference + EXPECT_TRUE( + IsClose(static_cast(3.141592f), static_cast(3.141593f), 0.0, 0.0)); + + // Trivial case. 
+ EXPECT_FALSE(IsClose(static_cast(1e4f), static_cast(1e-4f))); + + TestEdgeCasesClose(); +} + +TEST(TensorTestUtilTest, ExpectTensorCloseFloat) { + typedef float T; + + EXPECT_TRUE(IsClose(1.0f, 1.1f, 0.1f, 0.1f)); + EXPECT_TRUE(IsClose(1.0f, 1.0f, 0.0f, 0.0f)); + EXPECT_FALSE(IsClose(1.0f, 1.1f, 0.0f, 0.0f)); + + // Epsilon: 2^-23 ~ 0.00000011920928955078 + // Default Tolerance: 5/2^23 ~ 0.00000059604644775391 + + // 1.234567f -> 10,356,299/2^23 ~ 1.234567046165466308594 + // 1.234568f -> 10,356,307/2^23 ~ 1.234567999839782714844 + // 1.234566f -> 10,356,290/2^23 ~ 1.234565973281860351563 + // 1.234569f -> 10,356,315/2^23 ~ 1.234568953514099121094 + // 1.234565f -> 10,356,282/2^23 ~ 1.234565019607543945313 + // Threshold ~ 0.00000133190576434572 + EXPECT_TRUE(IsClose(1.234567f, 1.234567f)); + EXPECT_TRUE(IsClose(1.234567f, 1.234568f)); + EXPECT_TRUE(IsClose(1.234567f, 1.234566f)); + EXPECT_FALSE(IsClose(1.234567f, 1.234569f)); + EXPECT_FALSE(IsClose(1.234567f, 1.234565f)); + EXPECT_TRUE(IsClose(1.234567f, 1.234569f, 8e-7f, 1e-6f)); + EXPECT_TRUE(IsClose(1.234567f, 1.234565f, 3e-7f, 1.5e-6f)); + + // Too fine-grained: won't detect the difference + EXPECT_TRUE(IsClose(3.14159265f, 3.14159266f, 0.0f, 0.0f)); + + // Trivial cases + EXPECT_FALSE(IsClose(1e8f, 1e-8f)); + EXPECT_FALSE(IsClose(1e15f, 1e-15f)); + + TestEdgeCasesClose(); +} + +TEST(TensorTestUtilTest, ExpectTensorCloseDouble) { + typedef double T; + + EXPECT_TRUE(IsClose(1.0, 1.1, 0.1, 0.1)); + EXPECT_TRUE(IsClose(1.0, 1.0, 0.0, 0.0)); + EXPECT_FALSE(IsClose(1.0, 1.1, 0.0, 0.0)); + + // Epsilon: 2^-52 ~ 2.220446049250313080847E-16 + // Default Tolerance: 5/2^52 ~ 1.110223024625156540424E-15 + + // 1.234567890123456 -> 5,559,999,489,923,576/2^52 ~ 1.234567890123456024298 + // 1.234567890123457 -> 5,559,999,489,923,580/2^52 ~ 1.234567890123456912477 + // 1.234567890123455 -> 5,559,999,489,923,571/2^52 ~ 1.234567890123454914075 + // 1.234567890123458 -> 5,559,999,489,923,585/2^52 ~ 1.2345678901234580227 + // 1.234567890123454 -> 5,559,999,489,923,567/2^52 ~ 1.234567890123454025897 + // 1.234567890123459 -> 5,559,999,489,923,589/2^52 ~ 1.234567890123458910878 + // 1.234567890123453 -> 5,559,999,489,923,562/2^52 ~ 1.234567890123452915674 + // Threshold ~ 2.480868721703117812159E-15 + EXPECT_TRUE(IsClose(1.234567890123456, 1.234567890123456)); + EXPECT_TRUE(IsClose(1.234567890123456, 1.234567890123457)); + EXPECT_TRUE(IsClose(1.234567890123456, 1.234567890123455)); + EXPECT_TRUE(IsClose(1.234567890123456, 1.234567890123458)); + EXPECT_TRUE(IsClose(1.234567890123456, 1.234567890123454)); + EXPECT_FALSE(IsClose(1.234567890123456, 1.234567890123459)); + EXPECT_FALSE(IsClose(1.234567890123456, 1.234567890123453)); + EXPECT_TRUE(IsClose(1.234567890123456, 1.234567890123459, 9.5e-16, 1.6e-15)); + EXPECT_TRUE(IsClose(1.234567890123456, 1.234567890123453, 7e-16, 2e-15)); + + // Too fine-grained: won't detect the difference + EXPECT_TRUE(IsClose(3.141592653589793238, 3.141592653589793239, 0.0, 0.0)); + + // Trivial cases + EXPECT_FALSE(IsClose(1e15, 1e-15)); + EXPECT_FALSE(IsClose(1e30, 1e-30)); + + TestEdgeCasesClose(); +} + +} // namespace +} // namespace test +} // namespace tensorflow diff --git a/deepray/custom_ops/zero_out/BUILD b/deepray/custom_ops/zero_out/BUILD index 49c053d0..b8ac1cd1 100644 --- a/deepray/custom_ops/zero_out/BUILD +++ b/deepray/custom_ops/zero_out/BUILD @@ -10,6 +10,9 @@ custom_op_library( "cc/kernels/zero_out_kernels.cc", "cc/ops/zero_out_ops.cc", ], + deps = [ + "//deepray/custom_ops/utils:ok_status_util", + ], ) 
py_library( @@ -37,5 +40,8 @@ py_test( main = "python/tests/run_all_test.py", deps = [ ":zero_out_ops", + "@pypi_numpy//:pkg", + "@pypi_pytest//:pkg", + "@pypi_tensorflow//:pkg", ], ) diff --git a/deepray/custom_ops/zero_out/cc/kernels/zero_out_kernels.cc b/deepray/custom_ops/zero_out/cc/kernels/zero_out_kernels.cc index 56271426..abbe95e1 100644 --- a/deepray/custom_ops/zero_out/cc/kernels/zero_out_kernels.cc +++ b/deepray/custom_ops/zero_out/cc/kernels/zero_out_kernels.cc @@ -15,6 +15,11 @@ limitations under the License. #include "tensorflow/core/framework/op_kernel.h" +#define PRINT_MACRO_HELPER(x) #x +#define PRINT_MACRO(x) #x "=" PRINT_MACRO_HELPER(x) + +#pragma message(PRINT_MACRO(_GLIBCXX_USE_CXX11_ABI)) + using namespace tensorflow; class ZeroOutOp : public OpKernel { @@ -24,6 +29,10 @@ class ZeroOutOp : public OpKernel { void Compute(OpKernelContext* context) override { // Grab the input tensor const Tensor& input_tensor = context->input(0); + + OP_REQUIRES(context, TensorShapeUtils::IsVector(input_tensor.shape()), + errors::InvalidArgument("ZeroOut expects a 1-D vector.")); + auto input = input_tensor.flat(); // Create an output tensor diff --git a/deepray/custom_ops/zero_out/cc/ops/zero_out_ops.cc b/deepray/custom_ops/zero_out/cc/ops/zero_out_ops.cc index 70fd824b..71aec83f 100644 --- a/deepray/custom_ops/zero_out/cc/ops/zero_out_ops.cc +++ b/deepray/custom_ops/zero_out/cc/ops/zero_out_ops.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "deepray/custom_ops/utils/ok_status_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/shape_inference.h" @@ -23,5 +24,5 @@ REGISTER_OP("ZeroOut") .Output("zeroed: int32") .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { c->set_output(0, c->input(0)); - return Status::OK(); + return TFOkStatus; }); diff --git a/deepray/datasets/adult_census_income/adult_census_income.py b/deepray/datasets/adult_census_income/adult_census_income.py index e5ca1175..5ec8cced 100644 --- a/deepray/datasets/adult_census_income/adult_census_income.py +++ b/deepray/datasets/adult_census_income/adult_census_income.py @@ -4,13 +4,11 @@ import tensorflow as tf from absl import flags -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline import pandas as pd from sklearn.model_selection import train_test_split from sklearn.preprocessing import MinMaxScaler, LabelEncoder -FLAGS = flags.FLAGS - dir_path = os.path.dirname(os.path.realpath(__file__)) if os.path.exists(os.path.join(dir_path, 'feature_map.csv')): FLAGS([ @@ -19,7 +17,7 @@ ]) -class Adult_census_income(DataPipeLine): +class Adult_census_income(DataPipeline): def __init__(self, data_path='/workspaces/dataset/census/adult.csv'): super().__init__() @@ -48,9 +46,7 @@ def __init__(self, data_path='/workspaces/dataset/census/adult.csv'): f"--num_train_examples={self.train_df.shape[0]}", ]) - def build_dataset( - self, input_file_pattern, batch_size, is_training=True, prebatch_size=0, epochs=1, shuffle=True, *args, **kwargs - ): + def build_dataset(self, input_file_pattern, batch_size, is_training=True, epochs=1, shuffle=True, *args, **kwargs): if is_training: target = self.train_df.pop('income') dataset = tf.data.Dataset.from_tensor_slices((dict(self.train_df), target)) diff --git a/deepray/datasets/adult_census_income/adult_census_income_test.py 
b/deepray/datasets/adult_census_income/adult_census_income_test.py index 7724995b..4027f716 100644 --- a/deepray/datasets/adult_census_income/adult_census_income_test.py +++ b/deepray/datasets/adult_census_income/adult_census_income_test.py @@ -11,8 +11,6 @@ from deepray.datasets.adult_census_income import Adult_census_income from deepray.utils.benchmark import PerformanceCalculator -FLAGS = flags.FLAGS - TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/deepray/datasets/ali_display_ad_click/ali_display_ad_click.py b/deepray/datasets/ali_display_ad_click/ali_display_ad_click.py index 22ccf14d..c232ac93 100644 --- a/deepray/datasets/ali_display_ad_click/ali_display_ad_click.py +++ b/deepray/datasets/ali_display_ad_click/ali_display_ad_click.py @@ -3,21 +3,19 @@ from tensorflow.python.data.ops import dataset_ops from deepray.custom_ops.parquet_dataset import parquet_dataset_ops -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline -FLAGS = flags.FLAGS - -class Ali_display_ad_click(DataPipeLine): +class Ali_display_ad_click(DataPipeline): def parse(self, record): label_map = {} - for label in FLAGS.label: + for label in flags.FLAGS.label: # label_map[label] = record.pop(label) label_map[label] = tf.reshape(record.pop(label), [-1, 1]) return record, label_map - def build_dataset(self, input_file_pattern, batch_size, is_training=True, prebatch_size=0, *args, **kwargs): + def build_dataset(self, input_file_pattern, batch_size, is_training=True, *args, **kwargs): """Makes dataset (of filenames) from filename glob patterns.""" # Extract lines from input files using the Dataset API. @@ -30,16 +28,17 @@ def build_dataset(self, input_file_pattern, batch_size, is_training=True, prebat parquet_dataset_ops.DataFrame.Field(k, dtype, ragged_rank=1 if length != 1 else 0) for k, dtype, length in self.feature_map[["name", "dtype", "length"]].values ], - num_parallel_reads=FLAGS.parallel_reads_per_file if FLAGS.parallel_reads_per_file else dataset_ops.AUTOTUNE, + num_parallel_reads=flags.FLAGS.parallel_reads_per_file + if flags.FLAGS.parallel_reads_per_file else dataset_ops.AUTOTUNE, ) dataset = dataset.map( map_func=self.parse, - num_parallel_calls=FLAGS.parallel_parse if FLAGS.parallel_parse else dataset_ops.AUTOTUNE, + num_parallel_calls=flags.FLAGS.parallel_parse if flags.FLAGS.parallel_parse else dataset_ops.AUTOTUNE, ) - if FLAGS.shuffle_buffer: + if flags.FLAGS.shuffle_buffer: dataset = dataset.apply( - tf.data.experimental.shuffle_and_repeat(buffer_size=FLAGS.shuffle_buffer, count=FLAGS.epochs) + tf.data.experimental.shuffle_and_repeat(buffer_size=flags.FLAGS.shuffle_buffer, count=flags.FLAGS.epochs) ) else: - dataset = dataset.repeat(FLAGS.epochs) + dataset = dataset.repeat(flags.FLAGS.epochs) return dataset diff --git a/deepray/datasets/ali_display_ad_click/ali_display_ad_click_test.py b/deepray/datasets/ali_display_ad_click/ali_display_ad_click_test.py index 64c1c324..f0d1dd39 100644 --- a/deepray/datasets/ali_display_ad_click/ali_display_ad_click_test.py +++ b/deepray/datasets/ali_display_ad_click/ali_display_ad_click_test.py @@ -11,8 +11,6 @@ from deepray.datasets.ali_display_ad_click.ali_display_ad_click import Ali_display_ad_click from deepray.utils.benchmark import PerformanceCalculator -FLAGS = flags.FLAGS - TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/deepray/datasets/amazon_books_2014/amazon_books_2014.py b/deepray/datasets/amazon_books_2014/amazon_books_2014.py index ecdc5eb1..2ea68ba3 100644 
--- a/deepray/datasets/amazon_books_2014/amazon_books_2014.py
+++ b/deepray/datasets/amazon_books_2014/amazon_books_2014.py
@@ -19,12 +19,6 @@
 from deepray.datasets.tfrecord_pipeline import TFRecordPipeline
 
-FLAGS = flags.FLAGS
-FLAGS([
-  sys.argv[0],
-  "--num_train_examples=11932672",
-])
-
 LABEL = ["label"]
 NEGATIVE_HISTORY = ["item_feat_0_neg", "item_feat_1_neg"]
 POSITIVE_HISTORY = ["item_feat_0_pos", "item_feat_1_pos"]
@@ -37,6 +31,10 @@ class AmazonBooks2014(TFRecordPipeline):
   def __init__(self, max_seq_length, **kwargs):
     super().__init__(**kwargs)
     self._max_seq_length = max_seq_length
+    FLAGS([
+      sys.argv[0],
+      "--num_train_examples=11932672",
+    ])
 
   def parser(self, record):
     tf_feature_spec = {
diff --git a/deepray/datasets/amazon_books_2014/amazon_books_2014_test.py b/deepray/datasets/amazon_books_2014/amazon_books_2014_test.py
index 44681f12..f89843d7 100644
--- a/deepray/datasets/amazon_books_2014/amazon_books_2014_test.py
+++ b/deepray/datasets/amazon_books_2014/amazon_books_2014_test.py
@@ -11,8 +11,6 @@
 from deepray.utils.benchmark import PerformanceCalculator
 from .amazon_books_2014 import AmazonBooks2014
 
-FLAGS = flags.FLAGS
-
 TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S")
 
 
@@ -31,14 +29,14 @@ def runner(argv=None):
   if argv:
     FLAGS(argv, known_only=True)
 
-  data_pipe = AmazonBooks2014(FLAGS.max_seq_length)
+  prebatch_size = 5
+  data_pipe = AmazonBooks2014(FLAGS.max_seq_length, prebatch_size=prebatch_size)
 
   # create data pipline of train & test dataset
   # since each tfrecord file must include all of the features, it is enough to read first chunk for each split.
   # train_files = [dataset_dir / file for file in feature_spec.source_spec[TRAIN_MAPPING][0][FILES_SELECTOR]]
-  prebatch_size = 5
-  train_dataset = data_pipe(FLAGS.train_data, batch_size=FLAGS.batch_size, prebatch_size=prebatch_size)
+  train_dataset = data_pipe(FLAGS.train_data, batch_size=FLAGS.batch_size)
 
   _performance_calculator = PerformanceCalculator(0, 1000)
 
diff --git a/deepray/datasets/avazu/avazu.py b/deepray/datasets/avazu/avazu.py
index 74a5b130..b5d80249 100644
--- a/deepray/datasets/avazu/avazu.py
+++ b/deepray/datasets/avazu/avazu.py
@@ -23,9 +23,7 @@
 import tensorflow as tf
 from absl import flags
 
-from deepray.datasets.parquet_pipeline.ali_parquet_dataset import ParquetPipeLine
-
-FLAGS = flags.FLAGS
+from deepray.datasets.parquet_pipeline.ali_parquet_dataset import ParquetPipeline
 
 dir_path = os.path.dirname(os.path.realpath(__file__))
 FLAGS([
@@ -41,7 +39,7 @@
 DEFAULT_VALUE = {"int64": 0, "float32": 0.0, "bytes": ""}
 
 
-class Avazu(ParquetPipeLine):
+class Avazu(ParquetPipeline):
 
   def parse(self, record):
     for name in self.feature_map[(self.feature_map['length'] == 1)]["name"].values:
diff --git a/deepray/datasets/avazu/avazu_test.py b/deepray/datasets/avazu/avazu_test.py
index 4738e832..3dd43727 100644
--- a/deepray/datasets/avazu/avazu_test.py
+++ b/deepray/datasets/avazu/avazu_test.py
@@ -11,8 +11,6 @@
 from deepray.datasets.avazu.avazu import Avazu
 from deepray.utils.benchmark import PerformanceCalculator
 
-FLAGS = flags.FLAGS
-
 TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S")
 
 
diff --git a/deepray/datasets/cifar/cifar.py b/deepray/datasets/cifar/cifar.py
index 4896c1dc..f825e63f 100644
--- a/deepray/datasets/cifar/cifar.py
+++ b/deepray/datasets/cifar/cifar.py
@@ -22,18 +22,17 @@
 import numpy as np
 import tensorflow as tf
 from absl import flags
-from keras.utils.data_utils import get_file
+from keras.src.utils.data_utils import get_file
 from tensorflow import keras
 
-from deepray.datasets.datapipeline 
import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline -FLAGS = flags.FLAGS -FLAGS([ +flags.FLAGS([ sys.argv[0], "--num_train_examples=60000", ]) -class CIFAR(DataPipeLine): +class CIFAR(DataPipeline): def load_batch(self, fpath, label_key="labels"): """Internal utility for parsing CIFAR data. @@ -123,7 +122,7 @@ def __init__(self, **kwargs): ), ) - def build_dataset(self, input_file_pattern, batch_size, is_training=True, prebatch_size=0, *args, **kwargs): + def build_dataset(self, input_file_pattern, batch_size, is_training=True, *args, **kwargs): if is_training: num_train_samples = 50000 @@ -150,7 +149,7 @@ def build_dataset(self, input_file_pattern, batch_size, is_training=True, prebat y = keras.utils.to_categorical(y, num_classes) dataset = tf.data.Dataset.from_tensor_slices((x / 255.0, y)) - dataset = dataset.repeat(FLAGS.epochs).shuffle(10000).batch(batch_size) + dataset = dataset.repeat(flags.FLAGS.epochs).shuffle(10000).batch(batch_size) return dataset @@ -202,7 +201,7 @@ def __init__(self, label_mode="fine", **kwargs): f"Received: label_mode={label_mode}.") dirname = "cifar-100-python" - origin = "http://minio1.arsenal.kanzhun-inc.com/datasets/cifar100/cifar-100-python.tar.gz" #"https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz" + origin = "https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz" self.path = get_file( dirname, origin=origin, @@ -213,7 +212,7 @@ def __init__(self, label_mode="fine", **kwargs): ) self.label_mode = label_mode - def build_dataset(self, input_file_pattern, batch_size, is_training=True, prebatch_size=0, *args, **kwargs): + def build_dataset(self, input_file_pattern, batch_size, is_training=True, *args, **kwargs): if is_training: fpath = os.path.join(self.path, "train") @@ -230,5 +229,5 @@ def build_dataset(self, input_file_pattern, batch_size, is_training=True, prebat y = keras.utils.to_categorical(y, num_classes) dataset = tf.data.Dataset.from_tensor_slices((x / 255.0, y)) - dataset = dataset.repeat(FLAGS.epochs).shuffle(10000).batch(batch_size) + dataset = dataset.repeat(flags.FLAGS.epochs).shuffle(10000).batch(batch_size) return dataset diff --git a/deepray/datasets/cifar/cifar_test.py b/deepray/datasets/cifar/cifar_test.py index 1de32b11..de256065 100644 --- a/deepray/datasets/cifar/cifar_test.py +++ b/deepray/datasets/cifar/cifar_test.py @@ -9,8 +9,6 @@ from .cifar import CIFAR100, CIFAR10 -FLAGS = flags.FLAGS - TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/deepray/datasets/creditcardfraud/creditcardfraud.py b/deepray/datasets/creditcardfraud/creditcardfraud.py index 19e429de..a2c83c97 100644 --- a/deepray/datasets/creditcardfraud/creditcardfraud.py +++ b/deepray/datasets/creditcardfraud/creditcardfraud.py @@ -12,29 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Fashion-MNIST dataset.""" +"""Credit Card Fraud dataset.""" -import gzip -import os import sys + import numpy as np +import pandas as pd import tensorflow as tf from absl import flags -import pandas as pd -from keras.utils.data_utils import get_file from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline -FLAGS = flags.FLAGS -FLAGS([ +flags.FLAGS([ sys.argv[0], "--num_train_examples=182280", ]) -class CreditCardFraud(DataPipeLine): +class CreditCardFraud(DataPipeline): def __init__(self, url='https://storage.googleapis.com/download.tensorflow.org/data/creditcard.csv'): super().__init__() @@ -86,19 +83,12 @@ def __len__(self): pass def build_dataset( - self, - input_file_pattern, - batch_size, - is_training=True, - context: tf.distribute.InputContext = None, - use_horovod=False, - *args, - **kwargs + self, batch_size, input_file_pattern=None, is_training=True, epochs=1, shuffle=False, *args, **kwargs ): if is_training: ds = tf.data.Dataset.from_tensor_slices((self.train_features, self.train_labels)) else: ds = tf.data.Dataset.from_tensor_slices((self.val_features, self.val_labels)) - ds = ds.repeat(FLAGS.epochs).shuffle(10000).batch(batch_size) + ds = ds.repeat(flags.FLAGS.epochs).shuffle(10000).batch(batch_size) return ds diff --git a/deepray/datasets/creditcardfraud/creditcardfraud_test.py b/deepray/datasets/creditcardfraud/creditcardfraud_test.py index 94ff2a7f..ea7c6c91 100644 --- a/deepray/datasets/creditcardfraud/creditcardfraud_test.py +++ b/deepray/datasets/creditcardfraud/creditcardfraud_test.py @@ -9,8 +9,6 @@ from .creditcardfraud import CreditCardFraud -FLAGS = flags.FLAGS - TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/deepray/datasets/criteo/docker/Dockerfile_preprocessing b/deepray/datasets/criteo/Dockerfile_preprocessing similarity index 100% rename from deepray/datasets/criteo/docker/Dockerfile_preprocessing rename to deepray/datasets/criteo/Dockerfile_preprocessing diff --git a/deepray/datasets/criteo/README.md b/deepray/datasets/criteo/README.md deleted file mode 100644 index 922715d8..00000000 --- a/deepray/datasets/criteo/README.md +++ /dev/null @@ -1,282 +0,0 @@ -# Criteo dataset processing - -This repository provides a script and recipe to process Criteo Terabyte Dataset. - - -## Quick Start Guide - -To train your model using mixed or TF32 precision with Tensor Cores or using FP32, perform the following steps using -the default parameters of DLRM on the Criteo Terabyte dataset. For the specifics concerning training and inference, -see the [Advanced](#advanced) section. - -1. Clone the repository. -``` -git clone xxx -cd DeePray/deepray/datasets/criteo -``` - -2. Download the dataset. - -You can download the data by following the instructions at: http://labs.criteo.com/2013/12/download-terabyte-click-logs/. -When you have successfully downloaded it and unpacked it, set the `CRITEO_DATASET_PARENT_DIRECTORY` to its parent directory: -``` -CRITEO_DATASET_PARENT_DIRECTORY=/raid/criteo -``` -We recommend to choose the fastest possible file system, otherwise it may lead to an IO bottleneck. - -3. Build DLRM Docker containers -```bash -docker build -t criteo_preprocessing -f Dockerfile_preprocessing . --build-arg DGX_VERSION=[DGX-2|DGX-A100] -``` - -3. 
Start an interactive session in the NGC container to run preprocessing. -The DLRM PyTorch container can be launched with: -```bash -docker run --runtime=nvidia -it --rm --ipc=host -v ${CRITEO_DATASET_PARENT_DIRECTORY}:/data/dlrm criteo_preprocessing bash -``` - -4. Preprocess the dataset. - -Here are a few examples of different preprocessing commands. Out of the box, we support preprocessing on DGX-2 and DGX A100 systems. For the details on how those scripts work and detailed description of dataset types (small FL=15, large FL=3, xlarge FL=2), system requirements, setup instructions for different systems and all the parameters consult the [preprocessing section](#preprocessing). -For an explanation of the `FL` parameter, see the [Dataset Guidelines](#dataset-guidelines) and [Preprocessing](#preprocessing) sections. - -Depending on dataset type (small FL=15, large FL=3, xlarge FL=2) run one of following command: - -4.1. Preprocess to small dataset (FL=15) with Spark GPU: -```bash -cd /workspace/dlrm/preproc -./prepare_dataset.sh 15 GPU Spark -``` - -4.2. Preprocess to large dataset (FL=3) with Spark GPU: -```bash -cd /workspace/dlrm/preproc -./prepare_dataset.sh 3 GPU Spark -``` - -4.3. Preprocess to xlarge dataset (FL=2) with Spark GPU: -```bash -cd /workspace/dlrm/preproc -./prepare_dataset.sh 2 GPU Spark -``` - - -## Advanced - -The following sections provide greater details of the dataset. - - -### Getting the data - -This example uses the [Criteo Terabyte Dataset](https://labs.criteo.com/2013/12/download-terabyte-click-logs/). -The first 23 days are used as the training set. The last day is split in half. The first part, referred to as "test", is used for validating training results. The second one, referred to as "validation", is unused. - - -#### Dataset guidelines - -The preprocessing steps applied to the raw data include: -- Replacing the missing values with `0` -- Replacing the categorical values that exist fewer than `FL` times with a special value (FL value is called a frequency threshold or a frequency limit) -- Converting the hash values to consecutive integers -- Adding 3 to all the numerical features so that all of them are greater or equal to 1 -- Taking a natural logarithm of all numerical features - - -#### BYO dataset - -This implementation supports using other datasets thanks to BYO dataset functionality. -The BYO dataset functionality allows users to plug in their dataset in a common fashion for all Recommender models -that support this functionality. Using BYO dataset functionality, the user does not have to modify the source code of -the model thanks to the Feature Specification file. For general information on how BYO dataset works, refer to the -[BYO dataset overview section](#byo-dataset-functionality-overview). - -There are three ways to plug in user's dataset: -
-1. Provide an unprocessed dataset in a format matching the one used by Criteo 1TB, then use Criteo 1TB's preprocessing. Feature Specification file is then generated automatically. -The required format of the user's dataset is: - -The data should be split into text files. Each line of those text files should contain a single training example. -An example should consist of multiple fields separated by tabulators: - -* The first field is the label – 1 for a positive example and 0 for negative. -* The next N tokens should contain the numerical features separated by tabs. -* The next M tokens should contain the hashed categorical features separated by tabs. - -The correct dataset files together with the Feature Specification yaml file will be generated automatically by preprocessing script. - -For an example of using this process, refer to the [Quick Start Guide](#quick-start-guide) - -
-2. Provide a CSV containing preprocessed data and a simplified Feature Specification yaml file, then transcode the data with `transcode.py` script -This option should be used if the user has their own CSV file with a preprocessed dataset they want to train on. - -The required format of the user's dataset is: -* CSV files containing the data, already split into train and test sets. -* Feature Specification yaml file describing the layout of the CSV data - -For an example of a feature specification file, refer to the `tests/transcoding` folder. - -The CSV containing the data: -* should be already split into train and test -* should contain no header -* should contain one column per feature, in the order specified by the list of features for that chunk - in the source_spec section of the feature specification file -* categorical features should be non-negative integers in the range [0,cardinality-1] if cardinality is specified - -The Feature Specification yaml file: -* needs to describe the layout of data in CSV files -* may contain information about cardinalities. However, if set to `auto`, they will be inferred from the data by the transcoding script. - -Refer to `tests/transcoding/small_csv.yaml` for an example of the yaml Feature Specification. - -The following example shows how to use this way of plugging user's dataset: - -Prepare your data and save the path: -```bash -DATASET_PARENT_DIRECTORY=/raid/dlrm -``` - -Build the DLRM image with: -```bash -docker build -t nvidia_dlrm_pyt . -``` -Launch the container with: -```bash -docker run --runtime=nvidia -it --rm --ipc=host -v ${DATASET_PARENT_DIRECTORY}:/data nvidia_dlrm_preprocessing bash -``` - -If you are just testing the process, you can create synthetic csv data: -```bash -python -m dlrm.scripts.gen_csv --feature_spec_in tests/transcoding/small_csv.yaml -``` - -Convert the data: -```bash -mkdir /data/conversion_output -python -m dlrm.scripts.transcode --input /data --output /data/converted -``` -You may need to tune the --chunk_size parameter. Higher values speed up the conversion but require more RAM. - -This will convert the data from `/data` and save the output in `/data/converted`. -A feature specification file describing the new data will be automatically generated. - -To run the training on 1 GPU: -```bash -python -m dlrm.scripts.main --mode train --dataset /data/converted --amp --cuda_graphs -``` - -- multi-GPU for DGX A100: -```bash -python -m torch.distributed.launch --no_python --use_env --nproc_per_node 8 \ - bash -c './bind.sh --cpu=dgxa100_ccx.sh --mem=dgxa100_ccx.sh python -m dlrm.scripts.main \ - --dataset /data/converted --seed 0 --epochs 1 --amp --cuda_graphs' -``` - -- multi-GPU for DGX-1 and DGX-2: -```bash -python -m torch.distributed.launch --no_python --use_env --nproc_per_node 8 \ - bash -c './bind.sh --cpu=exclusive -- python -m dlrm.scripts.main \ - --dataset /data/converted --seed 0 --epochs 1 --amp --cuda_graphs' -``` -
-
-3. Provide a fully preprocessed dataset, saved in split binary files, and a Feature Specification yaml file -This is the option to choose if you want full control over preprocessing and/or want to preprocess data directly to the target format. - -Your final output will need to contain a Feature Specification yaml describing data and file layout. -For an example feature specification file, refer to `tests/feature_specs/criteo_f15.yaml` - -For details, refer to the [BYO dataset overview section](#byo-dataset-functionality-overview). -
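For orientation, the following sketch shows one way such split binary files could be read back with NumPy. It mirrors the dtype conventions used elsewhere in this repository (float16 numerical features, bool labels, and the smallest integer type that fits a categorical feature's cardinality, as in `get_categorical_feature_type` in `preproc/data/defaults.py`); the file names and the cardinality value are examples only, not a guaranteed layout.

```python
# Illustrative sketch of reading one batch back from a split binary dataset with NumPy.
# File names and the cardinality are examples; dtype rules follow the constraints
# described in this document (float16 numerical features, bool labels).
import numpy as np

batch_size = 4
num_numerical = 13           # example value
cat_cardinality = 7_912_888  # example cardinality of one categorical feature

# smallest integer type that can hold the cardinality
cat_dtype = next(t for t in (np.int8, np.int16, np.int32) if cat_cardinality < np.iinfo(t).max)

labels = np.fromfile("train/label.bin", dtype=np.bool_, count=batch_size)
numerical = np.fromfile("train/numerical.bin", dtype=np.float16, count=batch_size * num_numerical)
numerical = numerical.reshape(batch_size, num_numerical)
cat_0 = np.fromfile("train/cat_0.bin", dtype=cat_dtype, count=batch_size)
```

Storing each categorical feature in its own file with the smallest integer type that fits its cardinality keeps the per-feature files compact, which is why the cardinality drives the dtype choice here.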
- - - -##### Channel definitions and requirements - -This model defines three channels: - -- categorical, accepting an arbitrary number of features -- numerical, accepting an arbitrary number of features -- label, accepting a single feature - - -The training script expects two mappings: - -- train -- test - -For performance reasons: -* The only supported dataset type is split binary -* Splitting chunks into multiple files is not supported. -* Each categorical feature has to be provided in a separate chunk -* All numerical features have to be provided in a single chunk -* All numerical features have to appear in the same order in channel_spec and source_spec -* Only integer types are supported for categorical features -* Only float16 is supported for numerical features - -##### BYO dataset constraints for the model - -There are the following constraints of BYO dataset functionality for this model: -1. The performance of the model depends on the dataset size. Generally, the model should scale better for datasets containing more data points. For a smaller dataset, you might experience slower performance than the one reported for Criteo -2. Using other datasets might require tuning some hyperparameters (for example, learning rate, beta1 and beta2) to reach desired accuracy. -3. The optimized cuda interaction kernels for FP16 and TF32 assume that the number of categorical variables is smaller than WARP_SIZE=32 and embedding size is <=128 -#### Preprocessing - -The preprocessing scripts provided in this repository support running both on CPU and GPU using [NVtabular](https://developer.nvidia.com/blog/announcing-the-nvtabular-open-beta-with-multi-gpu-support-and-new-data-loaders/) (GPU only) and [Apache Spark 3.0](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/apache-spark-3/). - -Please note that the preprocessing will require about 4TB of disk storage. - - -The syntax for the preprocessing script is as follows: -```bash -cd /workspace/dlrm/preproc -./prepare_dataset.sh -``` - -For the Criteo Terabyte dataset, we recommend a frequency threshold of `FL=3`(when using A100 40GB or V100 32 GB) or `FL=2`(when using A100 80GB) if you intend to run the hybrid-parallel mode -on multiple GPUs. If you want to make the model fit into a single NVIDIA Tesla V100-32GB, you can set `FL=15`. - -The first argument means the frequency threshold to apply to the categorical variables. For a frequency threshold `FL`, the categorical values that occur less -often than `FL` will be replaced with one special value for each category. Thus, a larger value of `FL` will require smaller embedding tables -and will substantially reduce the overall size of the model. - -The second argument is the hardware to use (either GPU or CPU). - -The third arguments is a framework to use (either NVTabular or Spark). In case of choosing a CPU preprocessing this argument is omitted as it only Apache Spark is supported on CPU. 
- -The preprocessing scripts make use of the following environment variables to configure the data directory paths: -- `download_dir` – this directory should contain the original Criteo Terabyte CSV files -- `spark_output_path` – directory to which the parquet data will be written -- `conversion_intermediate_dir` – directory used for storing intermediate data used to convert from parquet to train-ready format -- `final_output_dir` – directory to store the final results of the preprocessing which can then be used to train DLRM - -In the `final_output_dir` will be three subdirectories created: `train`, `test`, `validation`, and one json file – `model_size.json` – containing a maximal index of each category. -The `train` is the train dataset transformed from day_0 to day_22. -The `test` is the test dataset transformed from the prior half of day_23. -The `validation` is the dataset transformed from the latter half of day_23. - -The model is tested on 3 datasets resulting from Criteo dataset preprocessing: small (Freqency threshold = 15), large (Freqency threshold = 3) and xlarge (Freqency threshold = 2). Each dataset occupies approx 370GB of disk space. Table below presents information on the supercomputer and GPU count that are needed to train model on particular dataset. - -| Dataset | GPU VRAM consumption\* | Model checkpoint size\* | FL setting | DGX A100 40GB, 1GPU | DGX A100 40GB, 8GPU | DGX A100 80GB, 1GPU | DGX A100 80GB, 8GPU | DGX-1** or DGX-2, 1 GPU | DGX-1** or DGX-2, 8GPU | DGX-2, 16GPU | -| ------- | ---------------------- | ----------------------- | ---------- | -------------------- | -------------------- | -------------------- | -------------------- | ---------------------- | --------------------- | ------------ | -| small (FL=15) | 20.5 | 15.0 | 15 | Yes | Yes | Yes | Yes | Yes | Yes | Yes | -| large (FL=3) | 132.3 | 81.9 | 3 | NA | Yes | NA | Yes | NA | Yes | Yes | -| xlarge (FL=2) | 198.8 | 141.3 | 2 | NA | NA | NA | Yes | NA | NA | NA | - -\*with default embedding dimension setting -\**DGX-1 V100 32GB - -##### NVTabular - -NVTabular preprocessing is calibrated to run on [DGX A100](https://www.nvidia.com/en-us/data-center/dgx-a100/) and [DGX-2](https://www.nvidia.com/en-us/data-center/dgx-2/) AI systems. However, it should be possible to change the values of `ALL_DS_MEM_FRAC`, `TRAIN_DS_MEM_FRAC`, `TEST_DS_MEM_FRAC`, `VALID_DS_MEM_FRAC` in `preproc/preproc_NVTabular.py`, so that they'll work on also on other hardware platforms such as DGX-1 or a custom one. - -##### Spark - -The script `spark_data_utils.py` is a PySpark application, which is used to preprocess the Criteo Terabyte Dataset. In the Docker image, we have installed Spark 3.0.1, which will start a standalone cluster of Spark. The scripts `run_spark_cpu.sh` and `run_spark_gpu.sh` start Spark, then run several PySpark jobs with `spark_data_utils.py`. - -Note that the Spark job requires about 3TB disk space used for data shuffling. - -Spark preprocessing is calibrated to run on [DGX A100](https://www.nvidia.com/en-us/data-center/dgx-a100/) and [DGX-2](https://www.nvidia.com/en-us/data-center/dgx-2/) AI systems. However, it should be possible to change the values in `preproc/DGX-2_config.sh` or `preproc/DGX-A100_config.sh` -so that they'll work on also on other hardware platforms such as DGX-1 or a custom one. 
diff --git a/deepray/datasets/criteo/criteo.py b/deepray/datasets/criteo/criteo.py index ee39a41a..17e20922 100644 --- a/deepray/datasets/criteo/criteo.py +++ b/deepray/datasets/criteo/criteo.py @@ -17,17 +17,19 @@ import sys from absl import flags +from tensorflow.python.distribute.distribute_lib import InputContext -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline -FLAGS = flags.FLAGS -FLAGS([ - sys.argv[0], - "--num_train_examples=11932672", -]) +class Criteo(DataPipeline): -class Criteo(DataPipeLine): + def __init__(self, context: InputContext = None, **kwargs): + super().__init__(context, **kwargs) + flags.FLAGS([ + sys.argv[0], + "--num_train_examples=11932672", + ]) - def build_dataset(self, input_file_pattern, batch_size, is_training=True, prebatch_size=0, *args, **kwargs): + def build_dataset(self, input_file_pattern, batch_size, is_training=True, *args, **kwargs): pass diff --git a/deepray/datasets/criteo/criteo_dataset.md b/deepray/datasets/criteo/criteo_dataset.md new file mode 100644 index 00000000..89dd97d8 --- /dev/null +++ b/deepray/datasets/criteo/criteo_dataset.md @@ -0,0 +1,190 @@ +## Quick Start Guide + +To prepare the Criteo 1TB dataset for training, follow these steps. + +1. Make sure you meet the prerequisites. + +You will need around 4TB of storage for storing the original Criteo 1TB dataset, the results of some +intermediate preprocessing steps and the final dataset. The final dataset itself will take about 400GB. + +We recommend using local storage, such as a fast SSD drive, to run the preprocessing. Using other types of storage +will negatively impact the preprocessing time. + + +2. Build the preprocessing docker image. +```bash +docker build -t preproc_docker_image -f Dockerfile_spark . --build-arg DGX_VERSION=[DGX-2|DGX-A100] +``` + +3. Download the data by following the instructions at: http://labs.criteo.com/2013/12/download-terabyte-click-logs/. + +When you have successfully downloaded the dataset, put it in the `/data/criteo_orig` directory in the container +(`$PWD/data/criteo_orig` in the host system). + +4. Start an interactive session in the NGC container to run preprocessing. +The DLRM TensorFlow container can be launched with: + +```bash +mkdir -p data +docker run --runtime=nvidia -it --rm --ipc=host -v ${PWD}/data:/data preproc_docker_image bash +``` + +5. Unzip the data with: + +```bash +gunzip /data/criteo_orig/*.gz +``` + +6. Preprocess the data. + +Here are a few examples of different preprocessing commands. Out of the box, we support preprocessing on DGX-2 and DGX A100 systems. For the details on how those scripts work and detailed description of dataset types (small FL=15, large FL=3, xlarge FL=2), system requirements, setup instructions for different systems and all the parameters consult the [preprocessing section](#preprocessing). +For an explanation of the `FL` parameter, see the [Dataset Guidelines](#dataset-guidelines) and [Preprocessing](#preprocessing) sections. 
+
+Depending on the dataset type (small FL=15, large FL=3, xlarge FL=2), run one of the following commands:
+
+```bash
+export download_dir=/data/criteo_orig
+export final_output_dir=/data/preprocessed
+
+cd preproc
+
+# Preprocess to small dataset (FL=15) with Spark GPU:
+./prepare_dataset.sh 15 GPU Spark
+
+# Preprocess to large dataset (FL=3) with Spark GPU:
+./prepare_dataset.sh 3 GPU Spark
+
+# Preprocess to xlarge dataset (FL=2) with Spark GPU:
+./prepare_dataset.sh 2 GPU Spark
+
+# To run on Spark GPU with no frequency limit:
+./prepare_dataset.sh 0 GPU Spark
+```
+
+
+
+## Advanced
+
+### Dataset guidelines
+
+The first 23 days are used as the training set. The last day is split in half.
+The first part is used as a validation set and the second part is used as a hold-out test set.
+
+The preprocessing steps applied to the raw data include:
+- Replacing the missing values with `0`.
+- Replacing the categorical values that occur fewer than 15 times with a special value.
+- Converting the hash values to consecutive integers.
+- Adding 2 to all the numerical features so that all of them are greater than or equal to 1.
+- Taking the natural logarithm of all numerical features.
+
+
+### Preprocess with Spark
+
+The preprocessing scripts provided in this repository support running on both CPU and GPU, using [NVTabular](https://developer.nvidia.com/blog/announcing-the-nvtabular-open-beta-with-multi-gpu-support-and-new-data-loaders/) (GPU only) and [Apache Spark 3.0](https://www.nvidia.com/en-us/deep-learning-ai/solutions/data-science/apache-spark-3/).
+
+Please note that the preprocessing will require about 4TB of disk storage.
+
+
+The syntax for the preprocessing script is as follows:
+```bash
+cd /workspace/dlrm/preproc
+./prepare_dataset.sh
+```
+
+For the Criteo Terabyte dataset, we recommend a frequency threshold of `FL=3` (when using A100 40GB or V100 32GB) or `FL=2` (when using A100 80GB) if you intend to run the hybrid-parallel mode
+on multiple GPUs. If you want to make the model fit into a single NVIDIA Tesla V100-32GB, you can set `FL=15`.
+
+The first argument is the frequency threshold to apply to the categorical variables. For a frequency threshold `FL`, the categorical values that occur less
+often than `FL` will be replaced with one special value for each category. Thus, a larger value of `FL` will require smaller embedding tables
+and will substantially reduce the overall size of the model.
+
+The second argument is the hardware to use (either GPU or CPU).
+
+The third argument is the framework to use (either NVTabular or Spark). When preprocessing on CPU, this argument is omitted because only Apache Spark is supported on CPU.
+
+The preprocessing scripts make use of the following environment variables to configure the data directory paths:
+- `download_dir` – this directory should contain the original Criteo Terabyte CSV files
+- `spark_output_path` – directory to which the parquet data will be written
+- `conversion_intermediate_dir` – directory used for storing intermediate data used to convert from parquet to train-ready format
+- `final_output_dir` – directory to store the final results of the preprocessing which can then be used to train DLRM
+
+The preprocessing creates three subdirectories in `final_output_dir`: `train`, `test`, and `validation`, plus one JSON file, `model_size.json`, containing the maximal index of each category.
+The `train` directory holds the training data transformed from day_0 to day_22.
+The `test` directory holds the test data transformed from the prior half of day_23.
+The `validation` directory holds the validation data transformed from the latter half of day_23.
+
+The model is tested on 3 datasets resulting from Criteo dataset preprocessing: small (frequency threshold = 15), large (frequency threshold = 3) and xlarge (frequency threshold = 2). Each dataset occupies approximately 370GB of disk space. The table below lists the systems and GPU counts needed to train the model on each dataset.
+
+| Dataset | GPU VRAM consumption\* [GB] | Model checkpoint size\* [GB] | FL setting | DGX A100 40GB, 1GPU | DGX A100 40GB, 8GPU | DGX A100 80GB, 1GPU | DGX A100 80GB, 8GPU | DGX-1** or DGX-2, 1 GPU | DGX-1** or DGX-2, 8GPU | DGX-2, 16GPU |
+| ------- | ---------------------- | ----------------------- | ---------- | -------------------- | -------------------- | -------------------- | -------------------- | ---------------------- | --------------------- | ------------ |
+| small (FL=15) | 20.5 | 15.0 | 15 | Yes | Yes | Yes | Yes | Yes | Yes | Yes |
+| large (FL=3) | 132.3 | 81.9 | 3 | NA | Yes | NA | Yes | NA | Yes | Yes |
+| xlarge (FL=2) | 198.8 | 141.3 | 2 | NA | NA | NA | Yes | NA | NA | NA |
+
+\*with the default embedding dimension setting
+\**DGX-1 V100 32GB
+
+#### NVTabular
+
+NVTabular preprocessing is calibrated to run on [DGX A100](https://www.nvidia.com/en-us/data-center/dgx-a100/) and [DGX-2](https://www.nvidia.com/en-us/data-center/dgx-2/) AI systems. However, it should be possible to change the values of `ALL_DS_MEM_FRAC`, `TRAIN_DS_MEM_FRAC`, `TEST_DS_MEM_FRAC`, and `VALID_DS_MEM_FRAC` in `preproc/preproc_NVTabular.py` so that it also works on other hardware platforms such as DGX-1 or a custom one.
+
+#### Spark
+
+The script `spark_data_utils.py` is a PySpark application, which is used to preprocess the Criteo Terabyte Dataset. In the Docker image, we have installed Spark 3.0.1, which runs as a standalone Spark cluster. The scripts `run_spark_cpu.sh` and `run_spark_gpu.sh` start Spark, then run several PySpark jobs with `spark_data_utils.py`.
+
+Note that the Spark job requires about 3TB of disk space for data shuffling.
+
+Spark preprocessing is calibrated to run on [DGX A100](https://www.nvidia.com/en-us/data-center/dgx-a100/) and [DGX-2](https://www.nvidia.com/en-us/data-center/dgx-2/) AI systems. However, it should be possible to change the values in `preproc/DGX-2_config.sh` or `preproc/DGX-A100_config.sh`
+so that they also work on other hardware platforms such as DGX-1 or a custom one.
+
+The scripts `run_spark_cpu.sh` and `run_spark_gpu.sh` start Spark, then run several PySpark jobs with `spark_data_utils.py` that, for example:
+- generate the dictionary
+- transform the train dataset
+- transform the test dataset
+- transform the validation dataset
+
+Change the variables in the `run-spark.sh` script according to your environment and configure the paths:
+```
+export SPARK_LOCAL_DIRS=/data/spark-tmp
+export INPUT_PATH=/data/criteo
+export OUTPUT_PATH=/data/output
+```
+Note that the Spark job requires about 3TB of disk space for data shuffling.
+
+Where:
+`SPARK_LOCAL_DIRS` is the path Spark uses to write shuffle data.
+`INPUT_PATH` is the path of the Criteo Terabyte Dataset, including uncompressed files like day_0, day_1…
+`OUTPUT_PATH` is where the script writes the output data. It will generate the following subdirectories: `models`, `train`, `test`, and `validation`.
+- The `model` directory is the dictionary folder.
+- The `train` directory holds the training data transformed from day_0 to day_22.
+- The `test` directory holds the test data transformed from the prior half of day_23.
+- The `validation` directory holds the validation data transformed from the latter half of day_23.
+
+Configure the resources that Spark will use:
+```
+export TOTAL_CORES=80
+export TOTAL_MEMORY=800
+```
+
+Where:
+`TOTAL_CORES` is the total number of CPU cores you want Spark to use.
+
+`TOTAL_MEMORY` is the total memory Spark will use.
+
+Configure the frequency limit:
+```
+USE_FREQUENCY_LIMIT=15
+```
+The frequency limit is used to filter out the categorical values that appear fewer than n times in the whole dataset and map them to 0. Set this variable to 1 to enable the limit. The default frequency limit in the script is 15. You can also change the limit by editing the line `OPTS="--frequency_limit 8"`.
+
diff --git a/deepray/datasets/criteo/criteo_test.py b/deepray/datasets/criteo/criteo_test.py index 8f2cb38e..bb66063b 100644 --- a/deepray/datasets/criteo/criteo_test.py +++ b/deepray/datasets/criteo/criteo_test.py @@ -20,9 +20,6 @@ stop_threshold=False, ) -FLAGS = flags.FLAGS -logging.set_verbosity(logging.INFO) - TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/deepray/datasets/criteo/criteo_tsv_reader.py b/deepray/datasets/criteo/criteo_tsv_reader.py index 0a511105..bbccdecc 100644 --- a/deepray/datasets/criteo/criteo_tsv_reader.py +++ b/deepray/datasets/criteo/criteo_tsv_reader.py @@ -21,13 +21,11 @@ import tensorflow as tf from absl import flags -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline from deepray.utils.horovod_utils import get_world_size, get_rank -FLAGS = flags.FLAGS - -class CriteoTsvReader(DataPipeLine): +class CriteoTsvReader(DataPipeline): """Input reader callable for pre-processed Criteo data.
Raw Criteo data is assumed to be preprocessed in the following way: @@ -49,7 +47,6 @@ def build_dataset( input_file_pattern, batch_size, is_training=True, - prebatch_size=0, epochs=1, shuffle=True, *args, @@ -76,7 +73,7 @@ def make_dataset(): indices = tf.data.Dataset.range(get_world_size()) dataset = indices.interleave( - map_func=make_dataset, cycle_length=FLAGS.cycle_length, num_parallel_calls=tf.data.experimental.AUTOTUNE + map_func=make_dataset, cycle_length=flags.FLAGS.cycle_length, num_parallel_calls=tf.data.experimental.AUTOTUNE ) dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) @@ -93,7 +90,7 @@ def parser(self, example: tf.Tensor): fields = tf.io.decode_csv(example, record_defaults, field_delim='\t', na_value='-1') num_labels = 1 - label = tf.reshape(fields[0], [FLAGS.batch_size, 1]) + label = tf.reshape(fields[0], [flags.FLAGS.batch_size, 1]) features = {} num_dense = len(dense_defaults) diff --git a/deepray/datasets/criteo/criteo_tsv_reader_test.py b/deepray/datasets/criteo/criteo_tsv_reader_test.py index 3c987e07..6188e386 100644 --- a/deepray/datasets/criteo/criteo_tsv_reader_test.py +++ b/deepray/datasets/criteo/criteo_tsv_reader_test.py @@ -10,8 +10,6 @@ from deepray.datasets.criteo.criteo_tsv_reader import CriteoTsvReader from deepray.utils.benchmark import PerformanceCalculator -FLAGS = flags.FLAGS - def runner(argv=None): dir_path = os.path.dirname(os.path.realpath(__file__)) diff --git a/deepray/datasets/criteo/feature_map_small.csv b/deepray/datasets/criteo/feature_map_small.csv index 43296d20..45f9d00c 100644 --- a/deepray/datasets/criteo/feature_map_small.csv +++ b/deepray/datasets/criteo/feature_map_small.csv @@ -1,41 +1,41 @@ name,dtype,ftype,dim,length,voc_size -feature_0,int32,Label,1,1, -feature_1,float64,Numerical,1,1, -feature_2,float64,Numerical,1,1, -feature_3,float64,Numerical,1,1, -feature_4,float64,Numerical,1,1, -feature_5,float64,Numerical,1,1, -feature_6,float64,Numerical,1,1, -feature_7,float64,Numerical,1,1, -feature_8,float64,Numerical,1,1, -feature_9,float64,Numerical,1,1, -feature_10,float64,Numerical,1,1, -feature_11,float64,Numerical,1,1, -feature_12,float64,Numerical,1,1, -feature_13,float64,Numerical,1,1, -feature_14,int32,Categorical,16,1,7912888 -feature_15,int32,Categorical,16,1,33822 -feature_16,int32,Categorical,16,1,17138 -feature_17,int32,Categorical,16,1,7338 -feature_18,int32,Categorical,16,1,20045 -feature_19,int32,Categorical,16,1,3 -feature_20,int32,Categorical,16,1,7104 -feature_21,int32,Categorical,16,1,1381 -feature_22,int32,Categorical,16,1,62 -feature_23,int32,Categorical,16,1,5554113 -feature_24,int32,Categorical,16,1,582468 -feature_25,int32,Categorical,16,1,245827 -feature_26,int32,Categorical,16,1,10 -feature_27,int32,Categorical,16,1,2208 -feature_28,int32,Categorical,16,1,10666 -feature_29,int32,Categorical,16,1,103 -feature_30,int32,Categorical,16,1,3 -feature_31,int32,Categorical,16,1,967 -feature_32,int32,Categorical,16,1,14 -feature_33,int32,Categorical,16,1,8165895 -feature_34,int32,Categorical,16,1,2675939 -feature_35,int32,Categorical,16,1,7156452 -feature_36,int32,Categorical,16,1,302515 -feature_37,int32,Categorical,16,1,12021 -feature_38,int32,Categorical,16,1,96 -feature_39,int32,Categorical,16,1,34 \ No newline at end of file +f_c0,int32,Label,1,1, +f_c1,float64,Numerical,1,1, +f_c2,float64,Numerical,1,1, +f_c3,float64,Numerical,1,1, +f_c4,float64,Numerical,1,1, +f_c5,float64,Numerical,1,1, +f_c6,float64,Numerical,1,1, +f_c7,float64,Numerical,1,1, +f_c8,float64,Numerical,1,1, 
+f_c9,float64,Numerical,1,1, +f_c10,float64,Numerical,1,1, +f_c11,float64,Numerical,1,1, +f_c12,float64,Numerical,1,1, +f_c13,float64,Numerical,1,1, +f_c14,int32,Categorical,16,1,7912888 +f_c15,int32,Categorical,16,1,33822 +f_c16,int32,Categorical,16,1,17138 +f_c17,int32,Categorical,16,1,7338 +f_c18,int32,Categorical,16,1,20045 +f_c19,int32,Categorical,16,1,3 +f_c20,int32,Categorical,16,1,7104 +f_c21,int32,Categorical,16,1,1381 +f_c22,int32,Categorical,16,1,62 +f_c23,int32,Categorical,16,1,5554113 +f_c24,int32,Categorical,16,1,582468 +f_c25,int32,Categorical,16,1,245827 +f_c26,int32,Categorical,16,1,10 +f_c27,int32,Categorical,16,1,2208 +f_c28,int32,Categorical,16,1,10666 +f_c29,int32,Categorical,16,1,103 +f_c30,int32,Categorical,16,1,3 +f_c31,int32,Categorical,16,1,967 +f_c32,int32,Categorical,16,1,14 +f_c33,int32,Categorical,16,1,8165895 +f_c34,int32,Categorical,16,1,2675939 +f_c35,int32,Categorical,16,1,7156452 +f_c36,int32,Categorical,16,1,302515 +f_c37,int32,Categorical,16,1,12021 +f_c38,int32,Categorical,16,1,96 +f_c39,int32,Categorical,16,1,34 \ No newline at end of file diff --git a/deepray/datasets/criteo/feature_map_xlarge.csv b/deepray/datasets/criteo/feature_map_xlarge.csv index 40ecd51d..b90ab0f3 100644 --- a/deepray/datasets/criteo/feature_map_xlarge.csv +++ b/deepray/datasets/criteo/feature_map_xlarge.csv @@ -1,41 +1,41 @@ name,dtype,ftype,dim,length,voc_size -_c0,int32,Label,1,1, -_c1,float64,Numerical,1,1, -_c2,float64,Numerical,1,1, -_c3,float64,Numerical,1,1, -_c4,float64,Numerical,1,1, -_c5,float64,Numerical,1,1, -_c6,float64,Numerical,1,1, -_c7,float64,Numerical,1,1, -_c8,float64,Numerical,1,1, -_c9,float64,Numerical,1,1, -_c10,float64,Numerical,1,1, -_c11,float64,Numerical,1,1, -_c12,float64,Numerical,1,1, -_c13,float64,Numerical,1,1, -_c14,int32,Categorical,1,1,227605431 -_c15,int32,Categorical,1,1,39060 -_c16,int32,Categorical,1,1,17295 -_c17,int32,Categorical,1,1,7424 -_c18,int32,Categorical,1,1,20265 -_c19,int32,Categorical,1,1,3 -_c20,int32,Categorical,1,1,7122 -_c21,int32,Categorical,1,1,1543 -_c22,int32,Categorical,1,1,63 -_c23,int32,Categorical,1,1,130229466 -_c24,int32,Categorical,1,1,3067955 -_c25,int32,Categorical,1,1,405282 -_c26,int32,Categorical,1,1,10 -_c27,int32,Categorical,1,1,2208 -_c28,int32,Categorical,1,1,11938 -_c29,int32,Categorical,1,1,154 -_c30,int32,Categorical,1,1,3 -_c31,int32,Categorical,1,1,976 -_c32,int32,Categorical,1,1,14 -_c33,int32,Categorical,1,1,292775613 -_c34,int32,Categorical,1,1,40790947 -_c35,int32,Categorical,1,1,187188509 -_c36,int32,Categorical,1,1,590151 -_c37,int32,Categorical,1,1,12973 -_c38,int32,Categorical,1,1,108 -_c39,int32,Categorical,1,1,36 \ No newline at end of file +f_c0,int32,Label,1,1, +f_c1,float64,Numerical,1,1, +f_c2,float64,Numerical,1,1, +f_c3,float64,Numerical,1,1, +f_c4,float64,Numerical,1,1, +f_c5,float64,Numerical,1,1, +f_c6,float64,Numerical,1,1, +f_c7,float64,Numerical,1,1, +f_c8,float64,Numerical,1,1, +f_c9,float64,Numerical,1,1, +f_c10,float64,Numerical,1,1, +f_c11,float64,Numerical,1,1, +f_c12,float64,Numerical,1,1, +f_c13,float64,Numerical,1,1, +f_c14,int32,Categorical,1,1,227605431 +f_c15,int32,Categorical,1,1,39060 +f_c16,int32,Categorical,1,1,17295 +f_c17,int32,Categorical,1,1,7424 +f_c18,int32,Categorical,1,1,20265 +f_c19,int32,Categorical,1,1,3 +f_c20,int32,Categorical,1,1,7122 +f_c21,int32,Categorical,1,1,1543 +f_c22,int32,Categorical,1,1,63 +f_c23,int32,Categorical,1,1,130229466 +f_c24,int32,Categorical,1,1,3067955 +f_c25,int32,Categorical,1,1,405282 +f_c26,int32,Categorical,1,1,10 
+f_c27,int32,Categorical,1,1,2208 +f_c28,int32,Categorical,1,1,11938 +f_c29,int32,Categorical,1,1,154 +f_c30,int32,Categorical,1,1,3 +f_c31,int32,Categorical,1,1,976 +f_c32,int32,Categorical,1,1,14 +f_c33,int32,Categorical,1,1,292775613 +f_c34,int32,Categorical,1,1,40790947 +f_c35,int32,Categorical,1,1,187188509 +f_c36,int32,Categorical,1,1,590151 +f_c37,int32,Categorical,1,1,12973 +f_c38,int32,Categorical,1,1,108 +f_c39,int32,Categorical,1,1,36 \ No newline at end of file diff --git a/deepray/datasets/criteo/preproc/data/__init__.py b/deepray/datasets/criteo/preproc/data/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/deepray/datasets/criteo/preproc/data/defaults.py b/deepray/datasets/criteo/preproc/data/defaults.py new file mode 100644 index 00000000..b4e12767 --- /dev/null +++ b/deepray/datasets/criteo/preproc/data/defaults.py @@ -0,0 +1,43 @@ +# Copyright (c) 2021 NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np + +CATEGORICAL_CHANNEL = "categorical" +NUMERICAL_CHANNEL = "numerical" +LABEL_CHANNEL = "label" + +SPLIT_BINARY = "split_binary" + +TRAIN_MAPPING = "train" +TEST_MAPPING = "test" + +TYPE_SELECTOR = "type" +FEATURES_SELECTOR = "features" +FILES_SELECTOR = "files" + +DTYPE_SELECTOR = "dtype" +CARDINALITY_SELECTOR = "cardinality" + + +def get_categorical_feature_type(size: int): + """This function works both when max value and cardinality is passed. + Consistency by the user is required""" + types = (np.int8, np.int16, np.int32) + + for numpy_type in types: + if size < np.iinfo(numpy_type).max: + return numpy_type + + raise RuntimeError(f"Categorical feature of size {size} is too big for defined types") diff --git a/deepray/datasets/criteo/preproc/data/feature_spec.py b/deepray/datasets/criteo/preproc/data/feature_spec.py new file mode 100644 index 00000000..f40a43bb --- /dev/null +++ b/deepray/datasets/criteo/preproc/data/feature_spec.py @@ -0,0 +1,268 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import yaml +import os +from typing import Dict +from typing import List +import numpy as np +from defaults import CATEGORICAL_CHANNEL, NUMERICAL_CHANNEL, LABEL_CHANNEL, \ + TRAIN_MAPPING, TEST_MAPPING, \ + TYPE_SELECTOR, FEATURES_SELECTOR, FILES_SELECTOR, CARDINALITY_SELECTOR, DTYPE_SELECTOR, \ + SPLIT_BINARY, \ + get_categorical_feature_type +""" For performance reasons, numerical features are required to appear in the same order + in both source_spec and channel_spec. + For more detailed requirements, see the check_feature_spec method""" + + +class FeatureSpec: + + def __init__(self, feature_spec=None, source_spec=None, channel_spec=None, metadata=None, base_directory=None): + self.feature_spec: Dict = feature_spec if feature_spec is not None else {} + self.source_spec: Dict = source_spec if source_spec is not None else {} + self.channel_spec: Dict = channel_spec if channel_spec is not None else {} + self.metadata: Dict = metadata if metadata is not None else {} + self.base_directory: str = base_directory + + @classmethod + def from_yaml(cls, path): + with open(path, 'r') as feature_spec_file: + base_directory = os.path.dirname(path) + feature_spec = yaml.safe_load(feature_spec_file) + return cls.from_dict(feature_spec, base_directory=base_directory) + + @classmethod + def from_dict(cls, source_dict, base_directory): + return cls(base_directory=base_directory, **source_dict) + + def to_dict(self) -> Dict: + attributes_to_dump = ['feature_spec', 'source_spec', 'channel_spec', 'metadata'] + return {attr: self.__dict__[attr] for attr in attributes_to_dump} + + def to_string(self): + return yaml.dump(self.to_dict()) + + def to_yaml(self, output_path=None): + if not output_path: + output_path = self.base_directory + '/feature_spec.yaml' + with open(output_path, 'w') as output_file: + print(yaml.dump(self.to_dict()), file=output_file) + + def get_number_of_numerical_features(self) -> int: + numerical_features = self.channel_spec[NUMERICAL_CHANNEL] + return len(numerical_features) + + def cat_positions_to_names(self, positions: List[int]): + # Ordering needs to correspond to the one in get_categorical_sizes() + feature_names = self.get_categorical_feature_names() + return [feature_names[i] for i in positions] + + def get_categorical_feature_names(self): + """ Provides the categorical feature names. 
The returned order should be maintained.""" + return self.channel_spec[CATEGORICAL_CHANNEL] + + def get_categorical_sizes(self) -> List[int]: + """For a given feature spec, this function is expected to return the sizes in the order corresponding to the + order in the channel_spec section """ + categorical_features = self.get_categorical_feature_names() + cardinalities = [self.feature_spec[feature_name][CARDINALITY_SELECTOR] for feature_name in categorical_features] + + return cardinalities + + def check_feature_spec(self): + # TODO check if cardinality fits in dtype, check if base directory is set + # TODO split into two checking general and model specific requirements + # check that mappings are the ones expected + mapping_name_list = list(self.source_spec.keys()) + assert sorted(mapping_name_list) == sorted([TEST_MAPPING, TRAIN_MAPPING]) + + # check that channels are the ones expected + channel_name_list = list(self.channel_spec.keys()) + assert sorted(channel_name_list) == sorted([CATEGORICAL_CHANNEL, NUMERICAL_CHANNEL, LABEL_CHANNEL]) + + categorical_features_list = self.channel_spec[CATEGORICAL_CHANNEL] + numerical_features_list = self.channel_spec[NUMERICAL_CHANNEL] + label_features_list = self.channel_spec[LABEL_CHANNEL] + set_of_categorical_features = set(categorical_features_list) + set_of_numerical_features = set(numerical_features_list) + + # check that exactly one label feature is selected + assert len(label_features_list) == 1 + label_feature_name = label_features_list[0] + + # check that lists in channel spec contain unique names + assert sorted(list(set_of_categorical_features)) == sorted(categorical_features_list) + assert sorted(list(set_of_numerical_features)) == sorted(numerical_features_list) + + # check that all features used in channel spec are exactly ones defined in feature_spec + feature_spec_features = list(self.feature_spec.keys()) + channel_spec_features = list( + set.union(set_of_categorical_features, set_of_numerical_features, {label_feature_name}) + ) + assert sorted(feature_spec_features) == sorted(channel_spec_features) + + # check that correct dtypes are provided for all features + for feature_dict in self.feature_spec.values(): + assert DTYPE_SELECTOR in feature_dict + try: + np.dtype(feature_dict[DTYPE_SELECTOR]) + except TypeError: + assert False, "Type not understood by numpy" + + # check that categorical features have cardinality provided + for feature_name, feature_dict in self.feature_spec.items(): + if feature_name in set_of_categorical_features: + assert CARDINALITY_SELECTOR in feature_dict + assert isinstance(feature_dict[CARDINALITY_SELECTOR], int) + + for mapping_name in [TRAIN_MAPPING, TEST_MAPPING]: + + mapping = self.source_spec[mapping_name] + mapping_features = set() + for chunk in mapping: + # check that chunk has the correct type + assert chunk[TYPE_SELECTOR] == SPLIT_BINARY + + contained_features = chunk[FEATURES_SELECTOR] + containing_files = chunk[FILES_SELECTOR] + + # check that features are unique in mapping + for feature in contained_features: + assert feature not in mapping_features + mapping_features.add(feature) + + # check that chunk has at least one feature + assert len(contained_features) >= 1 + + # check that chunk has exactly one file + assert len(containing_files) == 1 + + first_feature = contained_features[0] + + if first_feature in set_of_categorical_features: + # check that each categorical feature is in a different file + assert len(contained_features) == 1 + + elif first_feature in set_of_numerical_features: + # check
that numerical features are all in one chunk + assert sorted(contained_features) == sorted(numerical_features_list) + + # check that ordering is exactly same as in channel spec - required for performance + assert contained_features == numerical_features_list + + # check numerical dtype + for feature in contained_features: + assert np.dtype(self.feature_spec[feature][DTYPE_SELECTOR]) == np.float16 + + elif first_feature == label_feature_name: + # check that label feature is in a separate file + assert len(contained_features) == 1 + + # check label dtype + assert np.dtype(self.feature_spec[first_feature][DTYPE_SELECTOR]) == bool + + else: + assert False, "Feature of unknown type" + + # check that all features appeared in mapping + assert sorted(mapping_features) == sorted(feature_spec_features) + + @staticmethod + def get_default_feature_spec(number_of_numerical_features, categorical_feature_cardinalities): + numerical_feature_fstring = "num_{}" + categorical_feature_fstring = "cat_{}.bin" + label_feature_name = "label" + + numerical_file_name = "numerical.bin" + categorical_file_fstring = "{}" # TODO remove .bin from feature name, add to file name + label_file_name = "label.bin" + + number_of_categorical_features = len(categorical_feature_cardinalities) + numerical_feature_names = [numerical_feature_fstring.format(i) for i in range(number_of_numerical_features)] + categorical_feature_names = [categorical_feature_fstring.format(i) for i in range(number_of_categorical_features)] + cat_feature_types = [get_categorical_feature_type(int(cat_size)) for cat_size in categorical_feature_cardinalities] + + feature_dict = { + f_name: { + DTYPE_SELECTOR: str(np.dtype(f_type)), + CARDINALITY_SELECTOR: f_size + } for f_name, f_type, f_size in + zip(categorical_feature_names, cat_feature_types, categorical_feature_cardinalities) + } + for f_name in numerical_feature_names: + feature_dict[f_name] = {DTYPE_SELECTOR: str(np.dtype(np.float16))} + feature_dict[label_feature_name] = {DTYPE_SELECTOR: str(np.dtype(bool))} + + channel_spec = { + CATEGORICAL_CHANNEL: categorical_feature_names, + NUMERICAL_CHANNEL: numerical_feature_names, + LABEL_CHANNEL: [label_feature_name] + } + source_spec = {} + + for filename in (TRAIN_MAPPING, TEST_MAPPING): + source_spec[filename] = [] + dst_folder = filename + + numerical_file_path = os.path.join(dst_folder, numerical_file_name) + source_spec[filename].append( + { + TYPE_SELECTOR: SPLIT_BINARY, + FEATURES_SELECTOR: numerical_feature_names, + FILES_SELECTOR: [numerical_file_path] + } + ) + + label_file_path = os.path.join(dst_folder, label_file_name) + source_spec[filename].append( + { + TYPE_SELECTOR: SPLIT_BINARY, + FEATURES_SELECTOR: [label_feature_name], + FILES_SELECTOR: [label_file_path] + } + ) + + for feature_name in categorical_feature_names: + categorical_file_name = categorical_file_fstring.format(feature_name) + categorical_file_path = os.path.join(dst_folder, categorical_file_name) + source_spec[filename].append( + { + TYPE_SELECTOR: SPLIT_BINARY, + FEATURES_SELECTOR: [feature_name], + FILES_SELECTOR: [categorical_file_path] + } + ) + + return FeatureSpec(feature_spec=feature_dict, source_spec=source_spec, channel_spec=channel_spec, metadata={}) + + def get_mapping_paths(self, mapping_name: str): + label_feature_name = self.channel_spec[LABEL_CHANNEL][0] + set_of_categorical_features = set(self.channel_spec[CATEGORICAL_CHANNEL]) + set_of_numerical_features = set(self.channel_spec[NUMERICAL_CHANNEL]) + + label_path = None + numerical_path = None + 
categorical_paths = dict() + for chunk in self.source_spec[mapping_name]: + local_path = os.path.join(self.base_directory, chunk[FILES_SELECTOR][0]) + if chunk[FEATURES_SELECTOR][0] in set_of_numerical_features: + numerical_path = local_path + elif chunk[FEATURES_SELECTOR][0] in set_of_categorical_features: + local_feature = chunk[FEATURES_SELECTOR][0] + categorical_paths[local_feature] = local_path + elif chunk[FEATURES_SELECTOR][0] == label_feature_name: + label_path = local_path + + return label_path, numerical_path, categorical_paths diff --git a/deepray/datasets/criteo/preproc/parquet_to_binary.py b/deepray/datasets/criteo/preproc/parquet_to_binary.py index f824cee9..cf13b33b 100644 --- a/deepray/datasets/criteo/preproc/parquet_to_binary.py +++ b/deepray/datasets/criteo/preproc/parquet_to_binary.py @@ -23,10 +23,10 @@ def process_file(f, dst): - label = '_c0' - dense_columns = [f'_c{i}' for i in range(1, 14)] - categorical_columns = [f'_c{i}' for i in range(14, 40)] - all_columns_sorted = [f'_c{i}' for i in range(0, 40)] + label = 'f_c0' + dense_columns = [f'f_c{i}' for i in range(1, 14)] + categorical_columns = [f'f_c{i}' for i in range(14, 40)] + all_columns_sorted = [f'f_c{i}' for i in range(0, 40)] data = pd.read_parquet(f) data = data[all_columns_sorted] diff --git a/deepray/datasets/criteo/preproc/preproc_NVTabular.py b/deepray/datasets/criteo/preproc/preproc_NVTabular.py index b99b3be1..90b57faf 100644 --- a/deepray/datasets/criteo/preproc/preproc_NVTabular.py +++ b/deepray/datasets/criteo/preproc/preproc_NVTabular.py @@ -43,9 +43,9 @@ LambdaOp from cudf.io.parquet import ParquetWriter -CRITEO_CONTINUOUS_COLUMNS = [f'_c{x}' for x in range(1, 14)] -CRITEO_CATEGORICAL_COLUMNS = [f'_c{x}' for x in range(14, 40)] -CRITEO_CLICK_COLUMNS = ['_c0'] +CRITEO_CONTINUOUS_COLUMNS = [f'f_c{x}' for x in range(1, 14)] +CRITEO_CATEGORICAL_COLUMNS = [f'f_c{x}' for x in range(14, 40)] +CRITEO_CLICK_COLUMNS = ['f_c0'] COLUMNS = CRITEO_CONTINUOUS_COLUMNS + CRITEO_CATEGORICAL_COLUMNS + CRITEO_CLICK_COLUMNS CRITEO_TRAIN_DAYS = list(range(0, 23)) diff --git a/deepray/datasets/criteo/preproc/spark_data_utils.py b/deepray/datasets/criteo/preproc/spark_data_utils.py index f549f87b..ee9da510 100644 --- a/deepray/datasets/criteo/preproc/spark_data_utils.py +++ b/deepray/datasets/criteo/preproc/spark_data_utils.py @@ -33,7 +33,7 @@ def get_column_counts_with_frequency_limit(df, frequency_limit=None): - cols = ['_c%d' % i for i in CAT_COLS] + cols = ['f_c%d' % i for i in CAT_COLS] df = ( df.select(posexplode(array(*cols)) ).withColumnRenamed('pos', 'column_id').withColumnRenamed('col', @@ -182,7 +182,7 @@ def apply_models(df, models, broadcast_model=False, skew_broadcast_pct=1.0): # not make a difference. 
models = sorted(models, key=itemgetter(3), reverse=True) for i, model, original_rows, would_broadcast in models: - col_name = '_c%d' % i + col_name = 'f_c%d' % i if not (would_broadcast or broadcast_model): # The data is highly skewed so we need to offset that cutoff = int(original_rows * skew_broadcast_pct / 100.0) @@ -193,11 +193,11 @@ def apply_models(df, models, broadcast_model=False, skew_broadcast_pct=1.0): model = (model.drop('model_count').withColumnRenamed('data', col_name)) model = broadcast(model) if broadcast_model else model df = (df.join(model, col_name, how='left').drop(col_name).withColumnRenamed('id', col_name)) - return df.fillna(0, ['_c%d' % i for i in CAT_COLS]) + return df.fillna(0, ['f_c%d' % i for i in CAT_COLS]) def transform_log(df, transform_log=False): - cols = ['_c%d' % i for i in INT_COLS] + cols = ['f_c%d' % i for i in INT_COLS] if transform_log: for col_name in cols: df = df.withColumn(col_name, log(df[col_name] + 3)) @@ -226,9 +226,9 @@ def delete_data_source(spark, path): def load_raw(spark, folder, day_range): - label_fields = [StructField('_c%d' % LABEL_COL, IntegerType())] - int_fields = [StructField('_c%d' % i, IntegerType()) for i in INT_COLS] - str_fields = [StructField('_c%d' % i, StringType()) for i in CAT_COLS] + label_fields = [StructField('f_c%d' % LABEL_COL, IntegerType())] + int_fields = [StructField('f_c%d' % i, IntegerType()) for i in INT_COLS] + str_fields = [StructField('f_c%d' % i, StringType()) for i in CAT_COLS] schema = StructType(label_fields + int_fields + str_fields) paths = [os.path.join(folder, 'day_%d' % i) for i in day_range] @@ -423,7 +423,7 @@ def _main(): models = list(load_column_models(spark, args.model_folder, bool(args.model_size_file))) if args.model_size_file: save_model_size( - OrderedDict(('_c%d' % i, agg.size) for i, _, agg, _ in models), args.model_size_file, args.write_mode + OrderedDict(('f_c%d' % i, agg.size) for i, _, agg, _ in models), args.model_size_file, args.write_mode ) models = [(i, df, agg.sum, flag) for i, df, agg, flag in models] diff --git a/deepray/datasets/criteo/preproc/split_dataset.py b/deepray/datasets/criteo/preproc/split_dataset.py index 4dad640f..2e7a75df 100644 --- a/deepray/datasets/criteo/preproc/split_dataset.py +++ b/deepray/datasets/criteo/preproc/split_dataset.py @@ -25,8 +25,8 @@ import sys sys.path.append('/workspace/dlrm') -from dlrm.data.defaults import get_categorical_feature_type -from dlrm.data.feature_spec import FeatureSpec +from data.defaults import get_categorical_feature_type +from data.feature_spec import FeatureSpec def split_binary_file( @@ -71,7 +71,7 @@ def split_binary_file( numerical_f.write(numerical_features.astype(np.float16).tobytes()) label = batch_data[:, 0] - label_f.write(label.astype(np.bool).tobytes()) + label_f.write(label.astype(bool).tobytes()) cat_offset = num_numerical_features + 1 for cat_idx, cat_feature_type in enumerate(cat_feature_types): diff --git a/deepray/datasets/criteo/docker/requirements_preprocessing.txt b/deepray/datasets/criteo/requirements_preprocessing.txt similarity index 58% rename from deepray/datasets/criteo/docker/requirements_preprocessing.txt rename to deepray/datasets/criteo/requirements_preprocessing.txt index 58f5116b..6be9d65d 100644 --- a/deepray/datasets/criteo/docker/requirements_preprocessing.txt +++ b/deepray/datasets/criteo/requirements_preprocessing.txt @@ -1,4 +1,4 @@ numpy pandas -joblib +joblib==0.16 tqdm diff --git a/deepray/datasets/csv_pipeline.py b/deepray/datasets/csv_pipeline.py deleted file mode 100644 index 
af3016d7..00000000 --- a/deepray/datasets/csv_pipeline.py +++ /dev/null @@ -1,20 +0,0 @@ -import tensorflow as tf -from deepray.datasets.datapipeline import DataPipeLine -from absl import flags - -FLAGS = flags.FLAGS - - -class CSVPipeLine(DataPipeLine): - - def build_dataset(self, csv_path): - dataset = tf.data.experimental.make_csv_dataset( - csv_path, - record_defaults=list(self.feature_map["dtype"]), - column_names=list(self.feature_map["name"]), - batch_size=FLAGS.batch_size, - label_name=FLAGS.label, - field_delim=",", - header=True, - ) - return dataset diff --git a/deepray/datasets/csv_pipeline/__init__.py b/deepray/datasets/csv_pipeline/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/deepray/datasets/csv_pipeline/csv_pipeline.py b/deepray/datasets/csv_pipeline/csv_pipeline.py new file mode 100644 index 00000000..ac8dd6e0 --- /dev/null +++ b/deepray/datasets/csv_pipeline/csv_pipeline.py @@ -0,0 +1,18 @@ +import tensorflow as tf +from deepray.datasets.datapipeline import DataPipeline +from absl import flags + + +class CSVPipeline(DataPipeline): + + def build_dataset(self, batch_size, input_file_pattern, is_training=True, epochs=1, shuffle=False, *args, **kwargs): + dataset = tf.data.experimental.make_csv_dataset( + input_file_pattern, + record_defaults=list(self.feature_map["dtype"]), + column_names=list(self.feature_map["name"]), + batch_size=batch_size, + label_name=flags.FLAGS.label, + field_delim=",", + header=True, + ) + return dataset diff --git a/deepray/datasets/datapipeline.py b/deepray/datasets/datapipeline.py index 1c0dbf14..23dcc057 100644 --- a/deepray/datasets/datapipeline.py +++ b/deepray/datasets/datapipeline.py @@ -4,15 +4,13 @@ # @license : Copyright(C), import abc -import multiprocessing import os import urllib.request from enum import Enum import pandas as pd import tensorflow as tf -from absl import flags -from absl import logging +from absl import flags, logging import deepray from deepray.utils.data.feature_map import FeatureMap @@ -24,33 +22,29 @@ ROOT_PATH = os.path.dirname(deepray.__file__) -FLAGS = flags.FLAGS -flags.DEFINE_integer("parallel_parse", multiprocessing.cpu_count(), "Number of parallel parsing") -flags.DEFINE_integer("shuffle_buffer", None, "Size of shuffle buffer") -flags.DEFINE_integer("prefetch_buffer", 16, "Size of prefetch buffer") -flags.DEFINE_integer("parallel_reads_per_file", None, "Number of parallel reads per file") -flags.DEFINE_integer("interleave_cycle", 16, "Number of interleaved inputs") -flags.DEFINE_integer("interleave_block", 2, "Number of interleaved block_length inputs") -flags.DEFINE_float("neg_sample_rate", 0.0, "") -flags.DEFINE_string("conf_file", os.getcwd() + "/conf/dp.yaml", "configuration in file.") - IS_TRAINING = Enum('is_training', ('Train', 'Valid', 'Test')) -class DataPipeLine(tf.keras.layers.Layer): +class DataPipeline(object): def __init__(self, context: tf.distribute.InputContext = None, **kwargs): - super().__init__(**kwargs) - self.use_horovod = FLAGS.use_horovod + # super().__init__(**kwargs) + self.built = False + self.use_horovod = flags.FLAGS.use_horovod self.context = context - self.feature_map = FeatureMap(feature_map=FLAGS.feature_map, black_list=FLAGS.black_list).feature_map - # self.conf = Foo(FLAGS.conf_file).conf + self.feature_map = FeatureMap().feature_map + # self.conf = Foo(flags.FLAGS.conf_file).conf self.url = None + self.prebatch_size = kwargs.get("prebatch_size", None) @abc.abstractmethod def __len__(self): pass + @abc.abstractmethod + def build(self): + raise 
NotImplementedError("build: not implemented!") + @classmethod def read_list_from_file(cls, filename): file_list = tf.io.gfile.glob(filename) @@ -70,27 +64,18 @@ def parser(self, record): @abc.abstractmethod def build_dataset( - self, - input_file_pattern, - batch_size, - is_training=True, - prebatch_size=0, - epochs=1, - shuffle=False, - *args, - **kwargs + self, batch_size, input_file_pattern=None, is_training=True, epochs=1, shuffle=False, *args, **kwargs ): """ must be defined in subclass """ raise NotImplementedError("build_dataset: not implemented!") - def call(self, input_file_pattern=None, batch_size=None, is_training=True, prebatch_size=0, *args, **kwargs): + def __call__(self, batch_size=None, input_file_pattern=None, is_training=True, *args, **kwargs): """Gets a closure to create a dataset.""" - return self.build_dataset( - input_file_pattern=input_file_pattern, batch_size=self.context.get_per_replica_batch_size(batch_size) if self.context else batch_size, + input_file_pattern=input_file_pattern, is_training=is_training, epochs=1, *args, @@ -125,3 +110,9 @@ def _dataset_options(self, input_files): options.experimental_optimization.map_parallelization = True return options + + def train_test_split(self, arrays, test_size=0.33, shuffle=False): + from sklearn.model_selection import train_test_split + random_state = flags.FLAGS.random_seed if flags.FLAGS.random_seed else 1024 + X_train, X_test = train_test_split(arrays, test_size=test_size, shuffle=shuffle, random_state=random_state) + return X_train, X_test diff --git a/deepray/datasets/dataset_factory.py b/deepray/datasets/dataset_factory.py index f3fbb934..45b22763 100644 --- a/deepray/datasets/dataset_factory.py +++ b/deepray/datasets/dataset_factory.py @@ -1,8 +1,6 @@ from absl import logging, flags flags.DEFINE_string("data_source", "parquet_dataset", "parquet or tfrecord") - -FLAGS = flags.FLAGS """ Build model """ @@ -19,10 +17,10 @@ def load_dataset(): module_instance = ArsenalDatasetV3() elif module_class_name == "parquet_dataset": - from deepray.datasets.parquet_pipeline.ali_parquet_dataset import ParquetPipeLine + from deepray.datasets.parquet_pipeline.ali_parquet_dataset import ParquetPipeline logging.info("Load parquet dataset") - module_instance = ParquetPipeLine() + module_instance = ParquetPipeline() """ abs_mod_dir_path = os.path.dirname(os.path.realpath(__file__)) logging.info(f"abs_mod_dir_path: {abs_mod_dir_path}") diff --git a/deepray/datasets/downloader/GooglePretrainedWeightDownloader.py b/deepray/datasets/downloader/GooglePretrainedWeightDownloader.py index 08234316..71539eed 100644 --- a/deepray/datasets/downloader/GooglePretrainedWeightDownloader.py +++ b/deepray/datasets/downloader/GooglePretrainedWeightDownloader.py @@ -29,12 +29,12 @@ def __init__(self, save_path): self.model_urls = { 'bert_base_uncased': ( - 'http://minio1.arsenal.kanzhun-inc.com/datasets/bert_models/google_pretrained_weights/uncased_L-12_H-768_A-12.tar.gz', + 'https://storage.googleapis.com/tf_model_garden/nlp/bert/v3/uncased_L-12_H-768_A-12.tar.gz', 'uncased_L-12_H-768_A-12.tar.gz' ), 'bert_large_uncased': ( - 'http://minio1.arsenal.kanzhun-inc.com/datasets/bert_models/google_pretrained_weights/uncased_L-24_H-1024_A-16.tar.gz', + 'https://storage.googleapis.com/tf_model_garden/nlp/bert/v3/uncased_L-24_H-1024_A-16.tar.gz', 'uncased_L-24_H-1024_A-16.tar.gz' ), # 'bert_base_cased': ('https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/cased_L-12_H-768_A-12.tar.gz', 'cased_L-12_H-768_A-12.tar.gz'), diff --git 
a/deepray/datasets/downloader/bertPrep.py b/deepray/datasets/downloader/bertPrep.py index 0f751cc1..de0cf4ca 100644 --- a/deepray/datasets/downloader/bertPrep.py +++ b/deepray/datasets/downloader/bertPrep.py @@ -16,15 +16,17 @@ import pprint import subprocess -from bookscorpus import BookscorpusTextFormatting +import bookscorpus.BookscorpusTextFormatting +import pubmed.PubMedTextFormatting +import wikicorpus.WikicorpusTextFormatting + import Downloader -from pubmed import PubMedTextFormatting import TextSharding -from wikicorpus import WikicorpusTextFormatting def main(args): - working_dir = "/workspaces/dataset/wikicorpus_en" # os.environ['BERT_PREP_WORKING_DIR'] + working_dir = os.environ['BERT_PREP_WORKING_DIR'] + print('Working Directory:', working_dir) print('Action:', args.action) print('Dataset Name:', args.dataset) @@ -37,7 +39,7 @@ def main(args): + "_random_seed_" + str(args.random_seed) + "_dupe_factor_" + str(args.dupe_factor) \ + "_shard_" + str(args.n_training_shards) + "_test_split_" + str(int(args.fraction_test_set * 100)) directory_structure = { - 'download': working_dir + '', # Downloaded and decompressed + 'download': working_dir + '/download', # Downloaded and decompressed 'extracted': working_dir + '/extracted', # Extracted from whatever the initial format is (e.g., wikiextractor) 'formatted': working_dir + '/formatted_one_article_per_line', # This is the level where all sources should look the same @@ -71,7 +73,7 @@ def main(args): if args.dataset == 'bookscorpus': books_path = directory_structure['download'] + '/bookscorpus' - # books_path = directory_structure['download'] + #books_path = directory_structure['download'] output_filename = directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt' books_formatter = BookscorpusTextFormatting.BookscorpusTextFormatting(books_path, output_filename, recursive=True) books_formatter.merge() @@ -92,10 +94,9 @@ def main(args): wiki_formatter.merge() elif args.dataset == 'wikicorpus_zh': - assert False, 'wikicorpus_zh not fully supported at this time. The simplified/tradition Chinese data needs to be translated and properly segmented still, and should work once this step is ' \ - 'added.' + assert False, 'wikicorpus_zh not fully supported at this time. The simplified/tradition Chinese data needs to be translated and properly segmented still, and should work once this step is added.' 
if args.skip_wikiextractor == 0: - path_to_wikiextractor_in_container = 'WikiExtractor.py' + path_to_wikiextractor_in_container = '/workspace/wikiextractor/WikiExtractor.py' wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure[ 'download'] + '/' + args.dataset + '/wikicorpus_zh.xml ' + '-b 100M --processes ' + str( args.n_processes @@ -176,7 +177,7 @@ def main(args): last_process = None def create_record_worker(filename_prefix, shard_id, output_format='tfrecord', split='training'): - bert_preprocessing_command = 'python /workspaces/Deepray2/deepray/datasets/downloader/create_pretraining_data.py' + bert_preprocessing_command = 'python /workspace/bert_tf2/create_pretraining_data.py' bert_preprocessing_command += ' --input_file=' + directory_structure[ 'sharded'] + '/' + args.dataset + '/' + split + '/' + filename_prefix + '_' + str(shard_id) + '.txt' bert_preprocessing_command += ' --output_file=' + directory_structure[ diff --git a/deepray/datasets/downloader/create_datasets_from_start.sh b/deepray/datasets/downloader/create_datasets_from_start.sh index e16b7284..0677d45e 100755 --- a/deepray/datasets/downloader/create_datasets_from_start.sh +++ b/deepray/datasets/downloader/create_datasets_from_start.sh @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +set -e export BERT_PREP_WORKING_DIR=/workspaces/bert_tf2/data to_download=${1:-"all"} diff --git a/deepray/datasets/downloader/create_finetuning_data.py b/deepray/datasets/downloader/create_finetuning_data.py index 683f2f33..581e7083 100644 --- a/deepray/datasets/downloader/create_finetuning_data.py +++ b/deepray/datasets/downloader/create_finetuning_data.py @@ -31,8 +31,6 @@ from squad import squad_lib_sp import tokenization -FLAGS = flags.FLAGS - flags.DEFINE_enum( "fine_tuning_task_type", "classification", ["classification", "squad"], "The name of the BERT fine tuning task for which data " diff --git a/deepray/datasets/downloader/create_pretraining_data.py b/deepray/datasets/downloader/create_pretraining_data.py index 13040329..b922b241 100644 --- a/deepray/datasets/downloader/create_pretraining_data.py +++ b/deepray/datasets/downloader/create_pretraining_data.py @@ -24,8 +24,6 @@ import tokenization -FLAGS = flags.FLAGS - flags.DEFINE_string("input_file", None, "Input raw text file (or comma-separated list of files).") flags.DEFINE_string("output_file", None, "Output TF example file (or comma-separated list of files).") diff --git a/deepray/datasets/fashion_mnist/fashion_mnist.py b/deepray/datasets/fashion_mnist/fashion_mnist.py index eaff543e..bdb78adb 100644 --- a/deepray/datasets/fashion_mnist/fashion_mnist.py +++ b/deepray/datasets/fashion_mnist/fashion_mnist.py @@ -17,21 +17,21 @@ import gzip import os import sys + import numpy as np import tensorflow as tf from absl import flags -from keras.utils.data_utils import get_file +from keras.src.utils.data_utils import get_file -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline -FLAGS = flags.FLAGS -FLAGS([ +flags.FLAGS([ sys.argv[0], "--num_train_examples=60000", ]) -class FashionMNIST(DataPipeLine): +class FashionMNIST(DataPipeline): def __init__(self): """Loads the Fashion-MNIST dataset. 
@@ -104,16 +104,8 @@ def __len__(self): pass def build_dataset( - self, - input_file_pattern, - batch_size, - is_training=True, - context: tf.distribute.InputContext = None, - use_horovod=False, - *args, - **kwargs + self, batch_size, input_file_pattern=None, is_training=True, epochs=1, shuffle=False, *args, **kwargs ): - if is_training: with gzip.open(self.paths[0], "rb") as lbpath: y = np.frombuffer(lbpath.read(), np.uint8, offset=8) @@ -130,5 +122,5 @@ def build_dataset( dataset = tf.data.Dataset.from_tensor_slices( (tf.cast(x[..., tf.newaxis] / 255.0, tf.float32), tf.cast(y, tf.int64)) ) - dataset = dataset.repeat(FLAGS.epochs).shuffle(10000).batch(batch_size) + dataset = dataset.repeat(flags.FLAGS.epochs).shuffle(10000).batch(batch_size) return dataset diff --git a/deepray/datasets/fashion_mnist/fashion_mnist_test.py b/deepray/datasets/fashion_mnist/fashion_mnist_test.py index d18d6dc1..73c3d9c8 100644 --- a/deepray/datasets/fashion_mnist/fashion_mnist_test.py +++ b/deepray/datasets/fashion_mnist/fashion_mnist_test.py @@ -9,8 +9,6 @@ from .fashion_mnist import FashionMNIST -FLAGS = flags.FLAGS - TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/deepray/datasets/imagenet-1k/imagenet_to_gcs.py b/deepray/datasets/imagenet-1k/imagenet_to_gcs.py index 712f34a4..131606e9 100644 --- a/deepray/datasets/imagenet-1k/imagenet_to_gcs.py +++ b/deepray/datasets/imagenet-1k/imagenet_to_gcs.py @@ -57,8 +57,6 @@ 'Should have train and validation subdirectories inside it.' ) -FLAGS = flags.FLAGS - LABELS_FILE = 'synset_labels.txt' TRAINING_SHARDS = 1024 @@ -384,6 +382,5 @@ def main(_): if __name__ == '__main__': - logging.set_verbosity(logging.INFO) tf.disable_v2_behavior() app.run(main) diff --git a/deepray/datasets/imdb/imdb.py b/deepray/datasets/imdb/imdb.py index e257d87c..b1bc3f54 100644 --- a/deepray/datasets/imdb/imdb.py +++ b/deepray/datasets/imdb/imdb.py @@ -24,14 +24,12 @@ import tensorflow as tf from absl import flags -from deepray.datasets.datapipeline import DataPipeLine - -FLAGS = flags.FLAGS +from deepray.datasets.datapipeline import DataPipeline AUTOTUNE = tf.data.AUTOTUNE -class IMDB(DataPipeLine): +class IMDB(DataPipeline): def __init__(self, url='https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz', **kwargs): super().__init__(**kwargs) @@ -57,9 +55,7 @@ def parser(self, record): y = tokenized_sentences[:, 1:] return x, y - def build_dataset( - self, input_file_pattern, batch_size, is_training=True, prebatch_size=0, epochs=1, shuffle=True, *args, **kwargs - ): + def build_dataset(self, input_file_pattern, batch_size, is_training=True, epochs=1, shuffle=True, *args, **kwargs): if is_training: raw_ds = tf.keras.utils.text_dataset_from_directory( diff --git a/deepray/datasets/imdb/imdb_test.py b/deepray/datasets/imdb/imdb_test.py index 601357ce..c421df93 100644 --- a/deepray/datasets/imdb/imdb_test.py +++ b/deepray/datasets/imdb/imdb_test.py @@ -9,8 +9,6 @@ from .imdb import IMDB -FLAGS = flags.FLAGS - TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/deepray/datasets/kafka_dataset.py b/deepray/datasets/kafka_dataset.py deleted file mode 100644 index 5045eb11..00000000 --- a/deepray/datasets/kafka_dataset.py +++ /dev/null @@ -1,43 +0,0 @@ -from tensorflow.python.data.ops import readers -import tensorflow as tf -from deepray.datasets.datapipeline import DataPipeLine -from absl import flags - -FLAGS = flags.FLAGS - - -class KafkaDataset(DataPipeLine): - - def parse(self, raw_message, raw_key): - context_features, sequence_features = {}, {} - 
for key, dim in self.feature_map["FLOAT"].items(): - context_features[key] = tf.io.FixedLenFeature([], tf.float32) - for key, dim in self.feature_map["INT"].items(): - context_features[key] = tf.io.FixedLenFeature([], tf.int64) - for key, dim in self.feature_map["VARINT"].items(): - sequence_features[key] = tf.io.VarLenFeature(tf.int64) - - tensor, sparse_tensor = tf.io.parse_single_sequence_example( - serialized=raw_message, context_features=context_features, sequence_features=sequence_features - ) - reshaped_tensor = {} - for fea in context_features: - reshaped_tensor[fea] = tensor[fea] - # reshaped_tensor[fea] = tf.reshape(tensor[fea], [1]) - label = reshaped_tensor.pop(FLAGS.label) - for fea in sequence_features: - reshaped_tensor[fea] = sparse_tensor[fea] - # reshaped_tensor[fea] = tf.sparse.reshape(sparse_tensor[fea], [-1]) - return reshaped_tensor, label - - def build_dataset(self): - dataset = ( - readers.KafkaGroupIODataset( - topics=self.conf["Kafka"]["topics"], - group_id=self.conf["Kafka"]["group_id"], - servers=self.conf["Kafka"]["servers"], - stream_timeout=3000, - configuration=self.conf["Kafka"]["configuration"], - ).map(map_func=self.parse, num_parallel_calls=FLAGS.parallel_parse).batch(FLAGS.batch_size) - ) - return dataset diff --git a/deepray/datasets/kafka_pipeline/__init__.py b/deepray/datasets/kafka_pipeline/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/deepray/datasets/kafka_pipeline/kafka_pipeline.py b/deepray/datasets/kafka_pipeline/kafka_pipeline.py new file mode 100644 index 00000000..5ce39e62 --- /dev/null +++ b/deepray/datasets/kafka_pipeline/kafka_pipeline.py @@ -0,0 +1,254 @@ +import multiprocessing +import sys +from abc import ABC + +import tensorflow as tf +from tensorflow_io.python.ops import core_ops + +from deepray.datasets.datapipeline import DataPipeline +from deepray.utils import logging_util + +logger = logging_util.get_logger() + + +class KafkaGroupIODataset(tf.data.Dataset): + """Represents a streaming dataset from kafka using consumer groups. + + The dataset is created by fetching messages from kafka using consumer clients + which are part of a consumer group. Owing to the offset management capability of + the kafka brokers, the dataset can maintain offsets of all the partitions + without explicit initialization. If the consumer client joins an existing + consumer group, it will start fetching messages from the already committed offsets. + To start fetching the messages from the beginning, please join a different consumer group. + The dataset will be prepared from the committed/start offset until the last offset. + + The dataset can be prepared and iterated in the following manner: + + >>> import tensorflow_io as tfio + >>> dataset = tfio.experimental.streaming.KafkaGroupIODataset( + topics=["topic1"], + group_id="cg", + servers="localhost:9092" + ) + + >>> for (message, key) in dataset: + ... print(message) + + Cases may arise where the consumer read time out issues arise due to + the consumer group being in a rebalancing state. In order to address that, please + set `session.timeout.ms` and `max.poll.interval.ms` values in the configuration tensor + and try again after the group rebalances. For example: considering the kafka cluster + has been setup with the default settings, `max.poll.interval.ms` would be `300000ms`. + It can be changed to `8000ms` to reduce the time between pools. Also, the `session.timeout.ms` + can be changed to `7000ms`. 
However, the value for `session.timeout.ms` should be + according to the following relation: + + - `group.max.session.timeout.ms` in server.properties > `session.timeout.ms` in the + consumer.properties. + - `group.min.session.timeout.ms` in server.properties < `session.timeout.ms` in the + consumer.properties + + >>> dataset = tfio.experimental.streaming.KafkaGroupIODataset( + topics=["topic1"], + group_id="cg", + servers="localhost:9092", + configuration=[ + "session.timeout.ms=7000", + "max.poll.interval.ms=8000", + "auto.offset.reset=earliest", + ], + ) + + In the above example, the `auto.offset.reset` configuration is set to `earliest` so that + in case the consumer group is being newly created, it will start reading the messages from + the beginning. If it is not set, it defaults to `latest`. For additional configurations, + please refer the librdkafka's configurations: + https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md + + In addition to the standard streaming functionality, there is added support for a timeout + based stream. Once the existing data has been fetched, this dataset will block for + an additional `stream_timeout` milliseconds, for the new messages to be captured. + + >>> dataset = tfio.experimental.streaming.KafkaGroupIODataset( + topics=["topic1"], + group_id="cg", + servers="localhost:9092", + stream_timeout=30000, + configuration=[ + "session.timeout.ms=7000", + "max.poll.interval.ms=8000", + "auto.offset.reset=earliest", + ], + ) + >>> for (message, key) in dataset: + ... print(message) + + The above loop will run as long as the consumer clients are able to fetch messages + from the topic(s). However, since we set the `stream_timeout` value to `15000` milliseconds, + the dataset will wait for any new messages that might be added to the topic for that duration. + + As the kafka deployments vary in configuration as per various use-cases, the time required for + the consumers to fetch a single message might also vary. This timeout value can be adjusted + using the `message_poll_timeout` parameter. + + The `message_poll_timeout` value represents the duration which the consumers + have to wait while fetching a new message. However, even if we receive a new message + before the `message_poll_timeout` interval finishes, the consumer doesn't resume the + consumption but it will wait until the `message_poll_timeout` interval has finished. + Thus, if we want to block indefinitely until a new message arrives, + we cannot do it with `message_poll_timeout` alone. This is when the `stream_timeout` + value comes in, where we can set the value to a very high timeout + (i.e, block indefinitely) and keep on polling for new messages at + `message_poll_timeout` intervals. + """ + + def __init__( + self, + topics, + group_id, + servers, + stream_timeout=0, + message_poll_timeout=10000, + configuration=None, + internal=True, + ): + """ + Args: + topics: A `tf.string` tensor containing topic names in [topic] format. + For example: ["topic1", "topic2"] + group_id: The id of the consumer group. For example: cgstream + servers: An optional list of bootstrap servers. + For example: `localhost:9092`. + stream_timeout: An optional timeout duration (in milliseconds) to block until + the new messages from kafka are fetched. + By default it is set to 0 milliseconds and doesn't block for new messages. + To block indefinitely, set it to -1. 
+ message_poll_timeout: An optional timeout duration (in milliseconds) + after which the kafka consumer throws a timeout error while fetching + a single message. This value also represents the intervals at which + the kafka topic(s) are polled for new messages while using the `stream_timeout` + configuration: An optional `tf.string` tensor containing + configurations in [Key=Value] format. + Global configuration: please refer to 'Global configuration properties' + in librdkafka doc. Examples include + ["enable.auto.commit=false", "heartbeat.interval.ms=2000"] + Topic configuration: please refer to 'Topic configuration properties' + in librdkafka doc. Note all topic configurations should be + prefixed with `conf.topic.`. Examples include + ["conf.topic.auto.offset.reset=earliest"] + Reference: https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md + internal: Whether the dataset is being created from within the named scope. + Default: True + """ + with tf.name_scope("KafkaGroupIODataset"): + assert internal + + if stream_timeout == -1: + stream_timeout = sys.maxsize + elif stream_timeout >= 0: + # Taking the max of `stream_timeout` and `message_poll_timeout` + # to prevent the user from bothering about the underlying polling + # mechanism. + stream_timeout = max(stream_timeout, message_poll_timeout) + else: + raise ValueError("Invalid stream_timeout value: {} ,set it to -1 to block indefinitely.".format(stream_timeout)) + metadata = list(configuration or []) + if group_id is not None: + metadata.append("group.id=%s" % group_id) + if servers is not None: + metadata.append("bootstrap.servers=%s" % servers) + resource = core_ops.io_kafka_group_readable_init(topics=topics, metadata=metadata) + + self._resource = resource + dataset = tf.data.Dataset.counter() + dataset = dataset.map( + lambda i: core_ops.io_kafka_group_readable_next( + input=self._resource, + index=i, + message_poll_timeout=message_poll_timeout, + stream_timeout=stream_timeout, + ) + ) + dataset = dataset.take_while(lambda v: tf.greater(v.continue_fetch, 0)) + dataset = dataset.map(lambda v: v.message) + dataset = dataset.unbatch() + + self._dataset = dataset + super().__init__(self._dataset._variant_tensor) # pylint: disable=protected-access + + def _inputs(self): + return [] + + @property + def element_spec(self): + return self._dataset.element_spec + + +class KafkaPipeline(DataPipeline, ABC): + + def __init__( + self, + topics, + group_id, + servers, + stream_timeout=None, + configuration=None, + compression_type=None, + num_client=1, + **kwargs + ): + super().__init__(**kwargs) + self.topics = topics + self.group_id = group_id + self.servers = servers + self.stream_timeout = stream_timeout + self.configuration = configuration + self.compression_type = compression_type + self.num_client = num_client + + def build_dataset( + self, batch_size, input_file_pattern=None, is_training=True, epochs=1, shuffle=False, *args, **kwargs + ): + if self.num_client > 1: + logger.info(f"Using {self.num_client} Kafka clients.") + clients = tuple( + KafkaGroupIODataset( + topics=self.topics, + group_id=self.group_id, + servers=self.servers, + stream_timeout=self.stream_timeout, + configuration=self.configuration, + ) for _ in range(self.num_client) + ) + dataset = tf.data.Dataset.zip(clients) + dataset = dataset.map(lambda *x: tf.stack(x, axis=-1)).unbatch() + else: + dataset = KafkaGroupIODataset( + topics=self.topics, + group_id=self.group_id, + servers=self.servers, + stream_timeout=self.stream_timeout, + 
configuration=self.configuration, + ) + logger.info( + "Using only one Kafka client, if there is an IO bottleneck, it is recommended to adjust 'num_client' to increase the number of Kafka clients" + ) + if self.prebatch_size: + if batch_size > self.prebatch_size: + dataset = dataset.batch( + batch_size=batch_size // self.prebatch_size, + num_parallel_calls=tf.data.AUTOTUNE, + deterministic=True, + drop_remainder=True + ) + else: + dataset = dataset.batch(batch_size, num_parallel_calls=tf.data.AUTOTUNE, deterministic=True, drop_remainder=True) + if self.compression_type: + dataset = dataset.map( + lambda v: tf.io.decode_compressed(v, compression_type=self.compression_type), multiprocessing.cpu_count() + ) + if not hasattr(self.parser, "__isabstractmethod__"): + dataset = dataset.map(self.parser, multiprocessing.cpu_count()) + if self.prebatch_size and batch_size % self.prebatch_size != 0: + dataset = dataset.unbatch().batch(batch_size) + return dataset diff --git a/deepray/datasets/kafka_pipeline/kafka_pipeline_test.py b/deepray/datasets/kafka_pipeline/kafka_pipeline_test.py new file mode 100644 index 00000000..bc957110 --- /dev/null +++ b/deepray/datasets/kafka_pipeline/kafka_pipeline_test.py @@ -0,0 +1,52 @@ +# -*- coding: UTF-8 -*- +import sys +from absl import flags +import tensorflow as tf + +from tf_keras import backend as K + +import deepray as dp +from deepray.utils.benchmark import PerformanceCalculator +from deepray.utils import logging_util + +from deepray.utils.horovod_utils import is_main_process + +from deepray.datasets.kafka_pipeline.kafka_pipeline import KafkaPipeline + +logger = logging_util.get_logger() + + +def main(): + + data_pipe = KafkaPipeline( + # dataset_name=flags.FLAGS.dataset, + # partitions=[{'ds': date} for date in get_dates()], + ) + + train_dataset = data_pipe(input_file_pattern=None, batch_size=flags.FLAGS.batch_size) + + _performance_calculator = PerformanceCalculator(0, 1000) + num_examples = 0 + step = 0 + + for sample in train_dataset.take(1000): + step += 1 + # example = tf.train.Example() + # example.ParseFromString(sample[0].numpy()) + print(sample) + # print(key) + step_throughput = _performance_calculator(1, flags.FLAGS.batch_size) + + if num_examples % 100 == 0: + logger.info(f'step {step}, Perf {step_throughput} samples/s') + + print(num_examples) + results_perf = _performance_calculator.results + if not _performance_calculator.completed: + print(f"self._performance_calculator.completed: {_performance_calculator.completed}") + results_perf = _performance_calculator.get_current_benchmark_results() + print(results_perf) + + +if __name__ == "__main__": + dp.runner(main) diff --git a/deepray/datasets/mnist/mnist.py b/deepray/datasets/mnist/mnist.py index cd4051d9..f328534f 100644 --- a/deepray/datasets/mnist/mnist.py +++ b/deepray/datasets/mnist/mnist.py @@ -19,19 +19,13 @@ import numpy as np import tensorflow as tf from absl import flags -from keras.utils.data_utils import get_file +from keras.src.utils.data_utils import get_file -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline from deepray.utils.horovod_utils import get_rank, get_world_size -FLAGS = flags.FLAGS -FLAGS([ - sys.argv[0], - "--num_train_examples=60000", -]) - -class Mnist(DataPipeLine): +class Mnist(DataPipeline): def __init__(self, path="mnist.npz"): """Loads the MNIST dataset. 
@@ -80,6 +74,12 @@ def __init__(self, path="mnist.npz"): https://creativecommons.org/licenses/by-sa/3.0/) """ super().__init__() + + flags.FLAGS([ + sys.argv[0], + "--num_train_examples=60000", + ]) + origin_folder = ("https://storage.googleapis.com/tensorflow/tf-keras-datasets/") self.path = get_file( path, @@ -90,17 +90,15 @@ def __init__(self, path="mnist.npz"): ) def build_dataset( - self, input_file_pattern, batch_size, is_training=True, prebatch_size=0, epochs=1, shuffle=True, *args, **kwargs + self, batch_size, input_file_pattern=None, is_training=True, epochs=1, shuffle=False, *args, **kwargs ): with np.load(self.path, allow_pickle=True) as f: if is_training: - x, y = f["x_train"], f["y_train"] + image, label = f["x_train"], f["y_train"] else: - x, y = f["x_test"], f["y_test"] + image, label = f["x_test"], f["y_test"] - dataset = tf.data.Dataset.from_tensor_slices( - (tf.cast(x[..., tf.newaxis] / 255.0, tf.float32), tf.cast(y, tf.int64)) - ) + dataset = tf.data.Dataset.from_tensor_slices((tf.cast(image[..., tf.newaxis] / 255.0, tf.float32), label)) if self.use_horovod: # For multi-host training, we want each hosts to always process the same # subset of files. Each host only sees a subset of the entire dataset, diff --git a/deepray/datasets/mnist/mnist_test.py b/deepray/datasets/mnist/mnist_test.py index a79ec9f7..07cc49c6 100644 --- a/deepray/datasets/mnist/mnist_test.py +++ b/deepray/datasets/mnist/mnist_test.py @@ -2,41 +2,16 @@ # @Time : 2021/8/10 2:50 PM # @Author : Hailin.Fu # @license : Copyright(C), -import sys -from datetime import datetime +from absl import flags +from deepray.datasets.mnist import Mnist -from absl import app, flags +data_pipe = Mnist() +# create data pipline of train & test dataset +train_dataset = data_pipe(batch_size=flags.FLAGS.batch_size, is_training=True) +test_dataset = data_pipe(batch_size=flags.FLAGS.batch_size, is_training=False) -from .mnist import Mnist +num_examples = 0 +for x in train_dataset: + num_examples += flags.FLAGS.batch_size -FLAGS = flags.FLAGS - -TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") - - -def runner(argv=None): - if len(argv) <= 1: - argv = [ - sys.argv[0], - "--batch_size=16", - "-epochs=1", - "--train_data=movielens/1m-ratings", - # f"--feature_map={dir_path}/feature_map.csv", - "--label=clicked", - ] - if argv: - FLAGS(argv, known_only=True) - - data_pipe = Mnist() - # create data pipline of train & test dataset - train_dataset = data_pipe(FLAGS.train_data, FLAGS.batch_size, is_training=True) - num_examples = 0 - for x in train_dataset: - num_examples += FLAGS.batch_size - - print(x) - print(num_examples) - - -if __name__ == "__main__": - app.run(runner) +print(num_examples) diff --git a/deepray/datasets/movielens/movielens.csv b/deepray/datasets/movielens/movielens.csv index c200ef88..61f0f17e 100644 --- a/deepray/datasets/movielens/movielens.csv +++ b/deepray/datasets/movielens/movielens.csv @@ -1,4 +1,4 @@ -name,dtype,ftype,dim,length,voc_size,lr,optimizer,storage_type,composition_factor,ev_filter +name,dtype,ftype,dim,length,voc_size user_rating,int64,Label,1,1 user_id,int64,Categorical,32,1 movie_id,int64,Categorical,32,1 \ No newline at end of file diff --git a/deepray/datasets/movielens/movielens.py b/deepray/datasets/movielens/movielens.py index 6a941292..dbba39c4 100644 --- a/deepray/datasets/movielens/movielens.py +++ b/deepray/datasets/movielens/movielens.py @@ -4,11 +4,11 @@ import tensorflow as tf -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import 
DataPipeline from deepray.datasets.movielens import constants as rconst -class Movielens(DataPipeLine): +class Movielens(DataPipeline): @staticmethod def parser(self, serialized_data, batch_size=None, is_training=True): diff --git a/deepray/datasets/movielens/movielens_100k_ratings.py b/deepray/datasets/movielens/movielens_100k_ratings.py index 98d2e493..342ece15 100644 --- a/deepray/datasets/movielens/movielens_100k_ratings.py +++ b/deepray/datasets/movielens/movielens_100k_ratings.py @@ -1,50 +1,76 @@ -"""NCF model input pipeline.""" - import os -import sys +import numpy as np import tensorflow as tf from absl import flags -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline -FLAGS = flags.FLAGS -FLAGS( - [ - sys.argv[0], - "--num_train_examples=100000", - "--feature_map={}".format(os.path.join(os.path.dirname(__file__), "movielens.csv")), - ] -) +class Movielens100kRating(DataPipeline): -class Movielens100kRating(DataPipeLine): + def __init__(self, split=False, **kwargs): + super().__init__(**kwargs) + self.split = split + flags.FLAGS([ + "--feature_map={}".format(os.path.join(os.path.dirname(__file__), "movielens.csv")), + ]) + import tensorflow_datasets as tfds + # Ratings data. + self.ratings = tfds.load("movielens/100k-ratings", split="train", data_dir="/datasets/", download=True) + # Features of all the available movies. + self.movies = tfds.load('movielens/100k-movies', split="train", data_dir="/datasets/", download=True) + users = self.ratings.map(lambda x: x["user_id"], os.cpu_count()) + movie_ids = self.movies.map(lambda x: x["movie_id"], os.cpu_count()) + movies = self.movies.map(lambda x: x["movie_title"], os.cpu_count()) + self.user_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None) + self.user_ids_vocabulary.adapt(users.batch(1_000_000)) + self.movie_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None) + self.movie_ids_vocabulary.adapt(movie_ids.batch(1_000_000)) + self.movie_titles_vocabulary = tf.keras.layers.StringLookup(mask_token=None) + self.movie_titles_vocabulary.adapt(movies.batch(1_682)) + + def get_vocabulary(self, feature): + if feature == "user_id": + return self.user_ids_vocabulary.get_vocabulary() + elif feature == "movie_id": + return self.movie_ids_vocabulary.get_vocabulary() + elif feature == "movie_title": + return self.movie_titles_vocabulary.get_vocabulary() + else: + column = self.original_dataset.map(lambda x: { + feature: x[feature] + }, os.cpu_count()).batch(self.__len__).map(lambda x: x[feature], os.cpu_count()) + return np.unique(np.concatenate(list(column))) def parser(self, record): return { "movie_id": tf.strings.to_number(record["movie_id"], tf.int64), - "user_id": tf.strings.to_number(record["user_id"], tf.int64), + "movie_title": self.movie_titles_vocabulary(record["movie_title"]), + "user_id": self.user_ids_vocabulary(record["user_id"]), "movie_genres": tf.cast(record["movie_genres"][0], tf.int32), "user_gender": tf.cast(record["user_gender"], tf.int32), "user_occupation_label": tf.cast(record["user_occupation_label"], tf.int32), "raw_user_age": tf.cast(record["raw_user_age"], tf.int32), - "timestamp": tf.cast(record["timestamp"] - 880000000, tf.int32), + "timestamp": tf.cast(record["timestamp"] - 880000000, tf.int32) }, record["user_rating"] def build_dataset( - self, input_file_pattern, batch_size, is_training=True, prebatch_size=0, epochs=1, shuffle=True, *args, **kwargs + self, batch_size, input_file_pattern=None, is_training=True, epochs=1, shuffle=False, 
*args, **kwargs ): - import tensorflow_datasets as tfds - ratings = tfds.load("movielens/100k-ratings", split="train", data_dir="/dataset/", download=True) - ratings = ratings.map( - self.parser - # lambda x: { - # "movie_id": tf.strings.to_number(x["movie_id"], tf.int64), - # "user_id": tf.strings.to_number(x["user_id"], tf.int64), - # "user_rating": x["user_rating"] - # } - ) - ratings = ratings.repeat(FLAGS.epochs) - shuffled = ratings.shuffle(1_000_000, seed=2021, reshuffle_each_iteration=False) - dataset = shuffled.batch(batch_size) + dataset = self.ratings.map(self.parser, os.cpu_count()) + if epochs > 1: + dataset = dataset.repeat(epochs) + if shuffle: + dataset = dataset.shuffle(1_000_000, seed=2021, reshuffle_each_iteration=False) + if self.split: + if is_training: + dataset = dataset.take(80_000) + else: + dataset = dataset.skip(80_000).take(20_000) + dataset = dataset.batch(batch_size) return dataset + + @property + def __len__(self): + return 1_000_000 diff --git a/deepray/datasets/movielens/movielens_100k_ratings_test.py b/deepray/datasets/movielens/movielens_100k_ratings_test.py deleted file mode 100644 index 80459b3a..00000000 --- a/deepray/datasets/movielens/movielens_100k_ratings_test.py +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python -# @Time : 2021/8/10 2:50 PM -# @Author : Hailin.Fu -# @license : Copyright(C), -import sys -from datetime import datetime - -from absl import app, flags, logging - -from deepray.datasets.movielens.movielens_100k_ratings import Movielens100kRating - -FLAGS = flags.FLAGS -logging.set_verbosity(logging.INFO) - -TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") - - -def runner(argv=None): - if len(argv) <= 1: - argv = [ - sys.argv[0], - # "--batch_size=16", - "-epochs=1", - "--train_data=movielens/100k-ratings", - # f"--feature_map={dir_path}/feature_map.csv", - # "--label=clicked", - ] - if argv: - FLAGS(argv, known_only=True) - - data_pipe = Movielens100kRating() - # create data pipline of train & test dataset - train_dataset = data_pipe(FLAGS.train_data, FLAGS.batch_size, is_training=True) - num_examples = 0 - for x in train_dataset: - num_examples += FLAGS.batch_size - - print(x) - print(num_examples) - - -if __name__ == "__main__": - app.run(runner) diff --git a/deepray/datasets/movielens/movielens_1m_ratings.py b/deepray/datasets/movielens/movielens_1m_ratings.py index 53a2c241..94b768f6 100644 --- a/deepray/datasets/movielens/movielens_1m_ratings.py +++ b/deepray/datasets/movielens/movielens_1m_ratings.py @@ -1,46 +1,76 @@ -"""NCF model input pipeline.""" import os -import sys +import numpy as np import tensorflow as tf from absl import flags -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline -FLAGS = flags.FLAGS -FLAGS( - [ - sys.argv[0], - "--num_train_examples=1000224", + +class Movielens1MRating(DataPipeline): + + def __init__(self, split=False, **kwargs): + super().__init__(**kwargs) + self.split = split + flags.FLAGS([ "--feature_map={}".format(os.path.join(os.path.dirname(__file__), "movielens.csv")), - ] -) + ]) + import tensorflow_datasets as tfds + # Ratings data. + self.ratings = tfds.load("movielens/1m-ratings", split="train", data_dir="/datasets/", download=True) + # Features of all the available movies. 
+ self.movies = tfds.load('movielens/1m-movies', split="train", data_dir="/datasets/", download=True) + users = self.ratings.map(lambda x: x["user_id"], os.cpu_count()) + movie_ids = self.movies.map(lambda x: x["movie_id"], os.cpu_count()) + movies = self.movies.map(lambda x: x["movie_title"], os.cpu_count()) + self.user_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None) + self.user_ids_vocabulary.adapt(users.batch(1_000_000)) + self.movie_ids_vocabulary = tf.keras.layers.StringLookup(mask_token=None) + self.movie_ids_vocabulary.adapt(movie_ids.batch(1_000_000)) + self.movie_titles_vocabulary = tf.keras.layers.StringLookup(mask_token=None) + self.movie_titles_vocabulary.adapt(movies.batch(1_682)) + def get_vocabulary(self, feature): + if feature == "user_id": + return self.user_ids_vocabulary.get_vocabulary() + elif feature == "movie_id": + return self.movie_ids_vocabulary.get_vocabulary() + elif feature == "movie_title": + return self.movie_titles_vocabulary.get_vocabulary() + else: + column = self.original_dataset.map(lambda x: { + feature: x[feature] + }, os.cpu_count()).batch(self.__len__).map(lambda x: x[feature], os.cpu_count()) + return np.unique(np.concatenate(list(column))) -class Movielens1MRating(DataPipeLine): + def parser(self, record): + return { + "movie_id": self.movie_ids_vocabulary(record["movie_id"]), + "movie_title": self.movie_titles_vocabulary(record["movie_title"]), + "user_id": self.user_ids_vocabulary(record["user_id"]), + "movie_genres": tf.cast(record["movie_genres"][0], tf.int32), + "user_gender": tf.cast(record["user_gender"], tf.int32), + "user_occupation_label": tf.cast(record["user_occupation_label"], tf.int32), + "bucketized_user_age": tf.cast(record["bucketized_user_age"], tf.int32), + "timestamp": tf.cast(record["timestamp"] - 880000000, tf.int32) + }, record["user_rating"] def build_dataset( - self, - input_file_pattern, - batch_size, - is_training=True, - context: tf.distribute.InputContext = None, - use_horovod=False, - *args, - **kwargs + self, batch_size, input_file_pattern=None, is_training=True, epochs=1, shuffle=False, *args, **kwargs ): - import tensorflow_datasets as tfds - dataset = tfds.load(input_file_pattern, split='train') - features = dataset.map( - lambda x: { - "movie_id": tf.strings.to_number(x["movie_id"], tf.int64), - "user_id": tf.strings.to_number(x["user_id"], tf.int64), - } - ) - ratings = dataset.map(lambda x: tf.one_hot(tf.cast(x['user_rating'] - 1, dtype=tf.int64), 5)) - dataset = dataset.zip((features, ratings)) - dataset = dataset.repeat(FLAGS.epochs) - dataset = dataset.shuffle(1024, reshuffle_each_iteration=False) + dataset = self.ratings.map(self.parser, os.cpu_count()) + if epochs > 1: + dataset = dataset.repeat(epochs) + if shuffle: + dataset = dataset.shuffle(1_000_000, seed=2021, reshuffle_each_iteration=False) + if self.split: + if is_training: + dataset = dataset.take(80_000) + else: + dataset = dataset.skip(80_000).take(20_000) dataset = dataset.batch(batch_size) - return dataset + + @property + def __len__(self): + return 1_000_224 diff --git a/deepray/datasets/movielens/movielens_1m_ratings_test.py b/deepray/datasets/movielens/movielens_1m_ratings_test.py deleted file mode 100644 index 53373aa2..00000000 --- a/deepray/datasets/movielens/movielens_1m_ratings_test.py +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python -# @Time : 2021/8/10 2:50 PM -# @Author : Hailin.Fu -# @license : Copyright(C), -import sys -from datetime import datetime - -from absl import app, flags, logging - -from 
deepray.datasets.movielens.movielens_1m_ratings import Movielens1MRating - -FLAGS = flags.FLAGS -logging.set_verbosity(logging.INFO) - -TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") - - -def runner(argv=None): - if len(argv) <= 1: - argv = [ - sys.argv[0], - "--batch_size=16", - "-epochs=1", - "--train_data=movielens/1m-ratings", - # f"--feature_map={dir_path}/feature_map.csv", - "--label=clicked", - ] - if argv: - FLAGS(argv, known_only=True) - - data_pipe = Movielens1MRating() - # create data pipline of train & test dataset - train_dataset = data_pipe(FLAGS.train_data, FLAGS.batch_size, is_training=True) - num_examples = 0 - for x in train_dataset: - num_examples += FLAGS.batch_size - - print(x) - print(num_examples) - - -if __name__ == "__main__": - app.run(runner) diff --git a/deepray/datasets/movielens/movielens_ratings_test.py b/deepray/datasets/movielens/movielens_ratings_test.py new file mode 100644 index 00000000..87174de7 --- /dev/null +++ b/deepray/datasets/movielens/movielens_ratings_test.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python +# @Time : 2021/8/10 2:50 PM +# @Author : Hailin.Fu +# @license : Copyright(C), +import sys, os +import deepray as dp +from datetime import datetime +from absl import app, flags, logging + +from deepray.datasets.movielens.movielens_100k_ratings import Movielens100kRating + +os.environ["CUDA_VISIBLE_DEVICES"] = "0" + +TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") + + +def define_flags(): + argv = sys.argv + [ + "--epochs=1", + "--batch_size=2", + "--train_data=movielens/100k-ratings", + ] + flags.FLAGS(argv) + + +def runner(): + data_pipe = Movielens100kRating() + train_dataset = data_pipe(flags.FLAGS.batch_size, is_training=True) + num_examples = 0 + + for x in train_dataset: + num_examples += flags.FLAGS.batch_size + + print(x) + print(num_examples) + + +if __name__ == "__main__": + dp.runner(runner) diff --git a/deepray/datasets/movielens/process.py b/deepray/datasets/movielens/process.py index c555f152..65f5e51c 100644 --- a/deepray/datasets/movielens/process.py +++ b/deepray/datasets/movielens/process.py @@ -37,7 +37,7 @@ # pylint: enable=g-bad-import-order # URL to download dataset -_DATA_URL = "http://minio1.arsenal.kanzhun-inc.com/datasets/movielens/" +_DATA_URL = "https://files.grouplens.org/datasets/movielens/" GENRES = [ 'Action', 'Adventure', 'Animation', "Children", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', @@ -276,5 +276,5 @@ def main(_): if __name__ == "__main__": - FLAGS = flags.FLAGS + app.run(main) diff --git a/deepray/datasets/movielens/producer.py b/deepray/datasets/movielens/producer.py index 24008f44..d047876e 100644 --- a/deepray/datasets/movielens/producer.py +++ b/deepray/datasets/movielens/producer.py @@ -6,21 +6,11 @@ import tensorflow as tf from absl import flags -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline from deepray.datasets.movielens import constants as rconst -FLAGS = flags.FLAGS -FLAGS( - [ - sys.argv[0], - "--num_train_examples=5049000", - # "--=6138000" - "--feature_map={}".format(os.path.join(os.path.dirname(__file__), "movielens.csv")), - ] -) - -class Produce(DataPipeLine): +class Produce(DataPipeline): def __init__(self, params, producer): self._producer = producer diff --git a/deepray/datasets/openwebtext/openwebtext.py b/deepray/datasets/openwebtext/openwebtext.py index d4f5df24..0c623c14 100644 --- a/deepray/datasets/openwebtext/openwebtext.py +++ b/deepray/datasets/openwebtext/openwebtext.py @@ -20,22 
+20,21 @@ import tensorflow as tf from absl import flags -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline -FLAGS = flags.FLAGS FLAGS([ sys.argv[0], "--num_train_examples=60000", ]) -class Openwebtext(DataPipeLine): +class Openwebtext(DataPipeline): def __init__(self, max_seq_length, **kwargs): super().__init__(**kwargs) self._max_seq_length = max_seq_length - def build_dataset(self, input_file_pattern, batch_size, is_training=True, prebatch_size=0, *args, **kwargs): + def build_dataset(self, input_file_pattern, batch_size, is_training=True, *args, **kwargs): """The actual input function.""" input_files = tf.io.gfile.glob(input_file_pattern) diff --git a/deepray/datasets/openwebtext/openwebtext_test.py b/deepray/datasets/openwebtext/openwebtext_test.py index a307503d..a2c95f80 100644 --- a/deepray/datasets/openwebtext/openwebtext_test.py +++ b/deepray/datasets/openwebtext/openwebtext_test.py @@ -12,8 +12,6 @@ from .openwebtext import Openwebtext -FLAGS = flags.FLAGS - TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/deepray/datasets/parquet_pipeline/ali_parquet_dataset.py b/deepray/datasets/parquet_pipeline/ali_parquet_dataset.py index 0dc0a3d2..9ef186e0 100644 --- a/deepray/datasets/parquet_pipeline/ali_parquet_dataset.py +++ b/deepray/datasets/parquet_pipeline/ali_parquet_dataset.py @@ -1,45 +1,223 @@ +import random + +import pandas as pd import tensorflow as tf from absl import flags +from six import string_types +from tensorflow import dtypes from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.data.ops import readers +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import array_ops -from deepray.custom_ops.parquet_dataset import dataframe from deepray.custom_ops.parquet_dataset import parquet_dataset_ops -from deepray.datasets.datapipeline import DataPipeLine +from deepray.custom_ops.parquet_dataset.python.parquet_pybind import parquet_filenames_and_fields +from deepray.datasets.datapipeline import DataPipeline +from deepray.utils import logging_util +from deepray.utils.horovod_utils import get_rank, get_world_size + +logger = logging_util.get_logger() + + +def parquet_filenames(filenames, lower=False): + """Check and fetch parquet filenames and fields. + + Args: + filenames: List of Path of parquet file list. + lower: Convert field name to lower case if not found. + + Returns: + Validated file names and fields. + """ + if isinstance(filenames, string_types): + filenames = [filenames] + elif isinstance(filenames, (tuple, list)): + for f in filenames: + if not isinstance(f, string_types): + raise ValueError(f'{f} in `filenames` must be a string') + elif isinstance(filenames, dataset_ops.Dataset): + if filenames.output_types != dtypes.string: + raise TypeError('`filenames` must be a `tf.data.Dataset` of `tf.string` elements.') + if not filenames.output_shapes.is_compatible_with(tensor_shape.TensorShape([])): + raise ValueError('`filenames` must be a `tf.data.Dataset` of scalar `tf.string` ' + 'elements.') + elif isinstance(filenames, ops.Tensor): + if filenames.dtype != dtypes.string: + raise TypeError('`filenames` must be a `tf.Tensor` of `tf.string`.') + else: + raise ValueError( + f'`filenames` {filenames} must be a `tf.data.Dataset` of scalar ' + '`tf.string` elements or can be converted to a `tf.Tensor` of ' + '`tf.string`.' 
+ ) + + if not isinstance(filenames, dataset_ops.Dataset): + filenames = ops.convert_to_tensor(filenames, dtype=dtypes.string) + filenames = array_ops.reshape(filenames, [-1], name='filenames') + filenames = dataset_ops.Dataset.from_tensor_slices(filenames) + return filenames + + +class ParquetDataset(dataset_ops.DatasetV2): # pylint: disable=abstract-method + """A Parquet Dataset that reads batches from parquet files.""" + + VERSION = 2002 + + def __init__( + self, filenames, column_names=None, batch_size=1, num_parallel_reads=None, num_sequential_reads=2, parser=None + ): + """Create a `ParquetDataset`. + + Args: + filenames: A 0-D or 1-D `tf.string` tensor containing one or more + filenames. + batch_size: (Optional.) Maxium number of samples in an output batch. + column_names: (Optional.) List of DataFrame fields. + partition_count: (Optional.) Count of row group partitions. + partition_index: (Optional.) Index of row group partitions. + drop_remainder: (Optional.) If True, only keep batches with exactly + `batch_size` samples. + num_parallel_reads: (Optional.) A `tf.int64` scalar representing the + number of files to read in parallel. Defaults to reading files + sequentially. + num_sequential_reads: (Optional.) A `tf.int64` scalar representing the + number of batches to read in sequential. Defaults to 1. + """ + self._batch_size = batch_size + self._filter = filter + self._parser = parser + + filenames, fields = parquet_filenames_and_fields(filenames, column_names) + filenames = filenames.batch(32) + + def _create_dataset(f): + dataset = parquet_dataset_ops.ParquetDataset( + filenames=f, + fields=fields, + batch_size=self._batch_size, + ) + if self._parser: + dataset = dataset.map(self._parser, num_parallel_calls=tf.data.AUTOTUNE) + return dataset + + self._impl = self._build_dataset( + _create_dataset, filenames, num_parallel_reads=num_parallel_reads, num_sequential_reads=num_sequential_reads + ) + super().__init__(self._impl._variant_tensor) # pylint: disable=protected-access + + def _inputs(self): + return self._impl._inputs() # pylint: disable=protected-access + + @property + def element_spec(self): + return self._impl.element_spec # pylint: disable=protected-access + + def _build_dataset(self, dataset_creator, filenames, num_parallel_reads=None, num_sequential_reads=1): + """Internal method to create a `ParquetDataset`.""" + if num_parallel_reads is None: + return filenames.flat_map(dataset_creator) + if num_parallel_reads == dataset_ops.AUTOTUNE: + return filenames.interleave(dataset_creator, num_parallel_calls=2, deterministic=False) + return readers.ParallelInterleaveDataset( + filenames, + dataset_creator, + cycle_length=num_parallel_reads, + block_length=num_sequential_reads, + sloppy=True, + buffer_output_elements=None, + prefetch_input_elements=1 + ) + -FLAGS = flags.FLAGS +class ParquetPipeline(DataPipeline): + def __init__(self, column_names=[], **kwargs): + super().__init__(**kwargs) + self.column_names = column_names -class ParquetPipeLine(DataPipeLine): + # duplicate value check + visited = set() + dup_values = [name for name in self.column_names if name in visited or (visited.add(name) or False)] + assert len(dup_values) == 0, "The column_names input parameter has duplicate values: " + str(dup_values) + + self.info_df = pd.DataFrame() def parse(self, record): label_map = {} - for label in FLAGS.label: + for label in flags.FLAGS.label: # label_map[label] = record.pop(label) label_map[label] = tf.reshape(record.pop(label), [-1, 1]) return record, label_map - def 
build_dataset(self, input_file_pattern, batch_size, is_training=True, prebatch_size=0, *args, **kwargs): - """Makes dataset (of filenames) from filename glob patterns.""" - # Extract lines from input files using the Dataset API. - - file_list = self.read_list_from_file(input_file_pattern) - - dataset = parquet_dataset_ops.ParquetDataset( - file_list, - batch_size=batch_size, - fields=[ - parquet_dataset_ops.DataFrame.Field(k, dtype, ragged_rank=1 if length != 1 else 0) - for k, dtype, length in self.feature_map[["name", "dtype", "length"]].values - ] - ).apply(dataframe.to_sparse()) - dataset = dataset.map( - map_func=self.parse, - num_parallel_calls=FLAGS.parallel_parse if FLAGS.parallel_parse else dataset_ops.AUTOTUNE, - ) - if FLAGS.shuffle_buffer: - dataset = dataset.apply( - tf.data.experimental.shuffle_and_repeat(buffer_size=FLAGS.shuffle_buffer, count=FLAGS.epochs) + def build_dataset( + self, + input_file_pattern, + batch_size, + is_training=True, + epochs=1, + shuffle=False, + *args, + **kwargs + ) -> tf.data.Dataset: + if isinstance(input_file_pattern, str): + data_file_list = self.read_list_from_file(input_file_pattern) + else: + data_file_list = input_file_pattern + if not data_file_list: + raise ValueError("The input file list is empty!") + + # When `input_file` is a path to a single file or a list + # containing a single path, disable auto sharding so that + # same input file is sent to all workers. + random_state = flags.FLAGS.random_seed if flags.FLAGS.random_seed else 1024 + if shuffle and isinstance(data_file_list, list): + random.Random(random_state).shuffle(data_file_list) + logger.info(f"Shuffling {len(data_file_list)} parquet files.") + if isinstance(data_file_list, str) or len(data_file_list) < get_world_size(): + dataset = parquet_dataset_ops.ParquetDataset( + filenames=data_file_list, + fields=self.column_names if self.column_names else None, + batch_size=batch_size, ) + if self.use_horovod: + # For multi-host training, we want each hosts to always process the same + # subset of files. Each host only sees a subset of the entire dataset, + # allowing us to cache larger datasets in memory. + dataset = dataset.shard(num_shards=get_world_size(), index=get_rank()) + logger.info("Using samples distributing strategy ❤") + if not hasattr(self.parser, "__isabstractmethod__"): + dataset = dataset.map(self.parser, tf.data.AUTOTUNE) else: - dataset = dataset.repeat(FLAGS.epochs) + if self.use_horovod: + # For multi-host training, we want each hosts process different + # subset of files. Each host only sees a subset of the entire dataset, + # allowing us to cache larger datasets in memory. + data_file_list = [data_file_list[i] for i in range(len(data_file_list)) if i % get_world_size() == get_rank()] + logger.info("Using files distributing strategy ❤") + dataset = ParquetDataset( + filenames=data_file_list, + column_names=self.column_names if self.column_names else None, + batch_size=batch_size, + num_parallel_reads=dataset_ops.AUTOTUNE, + parser=None if hasattr(self.parser, "__isabstractmethod__") else self.parser + ) + + # if not hasattr(self.parser, "__isabstractmethod__"): + # dataset = dataset.map(self.parser, multiprocessing.cpu_count()) + # dataset = dataset.ignore_errors() + # Prefetch overlaps in-feed with training + # dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE) + # dataset = dataset.with_options(self._dataset_options(data_file_list)) + # Using `ignore_errors()` will drop the element that causes an error. 
+ # dataset = dataset.apply(tf.data.experimental.ignore_errors()) + + if shuffle: + shuffle_buffer = kwargs.get("shuffle_buffer", 10) + logger.debug(f"kwargs = {kwargs}") + logger.info(f"The shuffle_buffer is {shuffle_buffer}") + dataset = dataset.unbatch().shuffle( + buffer_size=shuffle_buffer, seed=flags.FLAGS.random_seed, reshuffle_each_iteration=False + ).batch(batch_size) + dataset = dataset.prefetch(tf.data.AUTOTUNE) return dataset diff --git a/deepray/datasets/parquet_pipeline/ali_parquet_dataset_test.py b/deepray/datasets/parquet_pipeline/ali_parquet_dataset_test.py index 2fdd60aa..5ca9fc3f 100644 --- a/deepray/datasets/parquet_pipeline/ali_parquet_dataset_test.py +++ b/deepray/datasets/parquet_pipeline/ali_parquet_dataset_test.py @@ -4,49 +4,44 @@ # @license : Copyright(C), import os import sys -from datetime import datetime -from absl import app, flags +from absl import flags -from deepray.datasets.parquet_pipeline.ali_parquet_dataset import ParquetPipeLine +import deepray as dp +from deepray.datasets.parquet_pipeline.ali_parquet_dataset import ParquetPipeline from deepray.utils.benchmark import PerformanceCalculator -FLAGS = flags.FLAGS +os.environ["CUDA_VISIBLE_DEVICES"] = "0" -TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") +def define_flags(): + argv = sys.argv + [ + "--batch_size=4096", "--epochs=1", "--dataset=ps_test", + "--feature_map=/workspaces/one-code/shadow-tf/datasets/feature_map.csv", + "--config_file=/workspaces/one-code/shadow-tf/train_feature_process.yaml" + ] + flags.FLAGS(argv) -def runner(argv=None): - dir_path = os.path.dirname(os.path.realpath(__file__)) - if len(argv) <= 1: - argv = [ - sys.argv[0], - "--batch_size=2", - "--epochs=1", - "--train_data=/workspaces/dataset/ali_display_ad_click/output/*.parquet", - "--feature_map=/workspaces/Deepray2/deepray/datasets/ali_display_ad_click/feature_map.csv", - # "--white_list=examples/Recommendation/yekuan/data_pipeline/white_list", - # f"--feature_map={dir_path}/bz_search_1to3.csv", - "--label=label", - ] - if argv: - FLAGS(argv, known_only=True) - - data_pipe = ParquetPipeLine() +def main(): + define_flags() + filenames = [ + "/workspaces/datasets/00000-1-038360cf-9d9d-454c-8381-6a57bdbf6d57-00001.parquet", + "/workspaces/datasets/01799-1800-26382079-2024-439e-84bf-e7b2231e0a2f-00001.parquet", + ] + data_pipe = ParquetPipeline(column_names=['f_c0', 'f_c1', 'f_c14']) # create data pipline of train & test dataset - train_dataset = data_pipe(FLAGS.train_data, FLAGS.batch_size, is_training=True) + train_dataset = data_pipe(batch_size=flags.FLAGS.batch_size, input_file_pattern=filenames, is_training=True) _performance_calculator = PerformanceCalculator(0, 1000) - # partitions = data_pipe.get_supported_partitions() - # print(partitions) num_examples = 0 step = 0 for batch in train_dataset.take(1000): step += 1 - num_examples += FLAGS.batch_size - step_throughput = _performance_calculator(1, FLAGS.batch_size) + num_examples += flags.FLAGS.batch_size + step_throughput = _performance_calculator(1, flags.FLAGS.batch_size) print(f'step {step}, Perf {step_throughput} samples/s') + print(batch) print(num_examples) results_perf = _performance_calculator.results @@ -57,4 +52,4 @@ def runner(argv=None): if __name__ == "__main__": - app.run(runner) + dp.runner(main) diff --git a/deepray/datasets/parquet_pipeline/parquet_pipeline_test.py b/deepray/datasets/parquet_pipeline/parquet_pipeline_test.py index 9542f5e7..02324044 100644 --- a/deepray/datasets/parquet_pipeline/parquet_pipeline_test.py +++ 
b/deepray/datasets/parquet_pipeline/parquet_pipeline_test.py @@ -11,8 +11,6 @@ from deepray.utils.benchmark import PerformanceCalculator from .parquet_pipeline import parquet_pipeline -FLAGS = flags.FLAGS - TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/deepray/datasets/squad/classifier_dataset.py b/deepray/datasets/squad/classifier_dataset.py new file mode 100644 index 00000000..dc2e2cb5 --- /dev/null +++ b/deepray/datasets/squad/classifier_dataset.py @@ -0,0 +1,101 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""BERT model input pipelines.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from deepray.datasets.datapipeline import DataPipeline +from deepray.utils.horovod_utils import get_rank, get_world_size + + +class Squad(DataPipeline): + + def __init__(self, max_seq_length, input_pipeline_context=None, **kwargs): + super().__init__(**kwargs) + self.max_seq_length = max_seq_length + self.input_pipeline_context = input_pipeline_context + + def decode_record(self, record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.io.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.cast(t, tf.int32) + example[name] = t + + return example + + def single_file_dataset(self, input_file, name_to_features): + """Creates a single-file dataset to be passed for BERT custom training.""" + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + d = tf.data.TFRecordDataset(input_file) + if self.use_horovod: + d = d.shard(num_shards=get_world_size(), index=get_rank()) + d = d.map(lambda record: self.decode_record(record, name_to_features)) + + # When `input_file` is a path to a single file or a list + # containing a single path, disable auto sharding so that + # same input file is sent to all workers. 
+ if isinstance(input_file, str) or len(input_file) == 1: + options = tf.data.Options() + options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF + d = d.with_options(options) + return d + + def build_dataset(self, input_file_pattern, batch_size, is_training=True, epochs=1, shuffle=False, *args, **kwargs): + """Creates input dataset from (tf)records files for train/eval.""" + name_to_features = { + 'input_ids': tf.io.FixedLenFeature([self.max_seq_length], tf.int64), + 'input_mask': tf.io.FixedLenFeature([self.max_seq_length], tf.int64), + 'segment_ids': tf.io.FixedLenFeature([self.max_seq_length], tf.int64), + 'label_ids': tf.io.FixedLenFeature([], tf.int64), + 'is_real_example': tf.io.FixedLenFeature([], tf.int64), + } + dataset = self.single_file_dataset(input_file_pattern, name_to_features) + + # The dataset is always sharded by number of hosts. + # num_input_pipelines is the number of hosts rather than number of cores. + if self.input_pipeline_context and self.input_pipeline_context.num_input_pipelines > 1: + dataset = dataset.shard( + self.input_pipeline_context.num_input_pipelines, self.input_pipeline_context.input_pipeline_id + ) + + def parser(record): + x = { + 'input_word_ids': record['input_ids'], + 'input_mask': record['input_mask'], + 'input_type_ids': record['segment_ids'] + } + y = record['label_ids'] + return x, y + + dataset = dataset.map(parser) + + if is_training: + dataset = dataset.shuffle(100) + dataset = dataset.repeat() + + dataset = dataset.batch(batch_size, drop_remainder=is_training) + dataset = dataset.prefetch(1024) + return dataset diff --git a/deepray/datasets/squad/pretrain_dataset.py b/deepray/datasets/squad/pretrain_dataset.py new file mode 100644 index 00000000..6373cdcc --- /dev/null +++ b/deepray/datasets/squad/pretrain_dataset.py @@ -0,0 +1,122 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""BERT model input pipelines.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from deepray.datasets.datapipeline import DataPipeline +from deepray.utils.horovod_utils import get_rank, get_world_size + + +class Squad(DataPipeline): + + def __init__(self, max_seq_length, input_pipeline_context=None, **kwargs): + super().__init__(**kwargs) + self.max_seq_length = max_seq_length + self.input_pipeline_context = input_pipeline_context + + def decode_record(self, record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.io.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. 
+ for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.cast(t, tf.int32) + example[name] = t + + return example + + def build_dataset( + self, + input_file_pattern, + batch_size, + max_predictions_per_seq, + is_training=True, + epochs=1, + shuffle=False, + *args, + **kwargs + ): + """Creates input dataset from (tf)records files for pretraining.""" + name_to_features = { + 'input_ids': tf.io.FixedLenFeature([self.max_seq_length], tf.int64), + 'input_mask': tf.io.FixedLenFeature([self.max_seq_length], tf.int64), + 'segment_ids': tf.io.FixedLenFeature([self.max_seq_length], tf.int64), + 'masked_lm_positions': tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64), + 'masked_lm_ids': tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64), + 'masked_lm_weights': tf.io.FixedLenFeature([max_predictions_per_seq], tf.float32), + 'next_sentence_labels': tf.io.FixedLenFeature([1], tf.int64), + } + + dataset = tf.data.Dataset.list_files(input_file_pattern, shuffle=is_training) + if self.use_horovod: + dataset = dataset.shard(num_shards=get_world_size(), index=get_rank()) + + if self.input_pipeline_context and self.input_pipeline_context.num_input_pipelines > 1: + dataset = dataset.shard( + self.input_pipeline_context.num_input_pipelines, self.input_pipeline_context.input_pipeline_id + ) + + dataset = dataset.repeat() + + # We set shuffle buffer to exactly match total number of + # training files to ensure that training data is well shuffled. + input_files = [] + for input_pattern in input_file_pattern: + input_files.extend(tf.io.gfile.glob(input_pattern)) + dataset = dataset.shuffle(len(input_files)) + + # In parallel, create tf record dataset for each train files. + # cycle_length = 8 means that up to 8 files will be read and deserialized in + # parallel. You may want to increase this number if you have a large number of + # CPU cores. 
+ dataset = dataset.interleave( + tf.data.TFRecordDataset, cycle_length=8, num_parallel_calls=tf.data.experimental.AUTOTUNE + ) + + decode_fn = lambda record: self.decode_record(record, name_to_features) + dataset = dataset.map(decode_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE) + + def parser(record): + """Filter out features to use for pretraining.""" + x = { + 'input_word_ids': record['input_ids'], + 'input_mask': record['input_mask'], + 'input_type_ids': record['segment_ids'], + 'masked_lm_positions': record['masked_lm_positions'], + 'masked_lm_ids': record['masked_lm_ids'], + 'masked_lm_weights': record['masked_lm_weights'], + 'next_sentence_labels': record['next_sentence_labels'], + } + + y = record['masked_lm_weights'] + + return x, y + + dataset = dataset.map(parser, num_parallel_calls=tf.data.experimental.AUTOTUNE) + + if is_training: + dataset = dataset.shuffle(100) + + dataset = dataset.batch(batch_size, drop_remainder=True) + dataset = dataset.prefetch(1024) + return dataset diff --git a/deepray/datasets/squad/squad.py b/deepray/datasets/squad/squad.py index 991ef79f..67c6d7ce 100644 --- a/deepray/datasets/squad/squad.py +++ b/deepray/datasets/squad/squad.py @@ -21,22 +21,18 @@ import tensorflow as tf from absl import flags -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline from deepray.utils.horovod_utils import get_rank, get_world_size -FLAGS = flags.FLAGS - -class Squad(DataPipeLine): +class Squad(DataPipeline): def __init__(self, max_seq_length, dataset_type="squad", **kwargs): super().__init__(**kwargs) self._max_seq_length = max_seq_length self.dataset_type = dataset_type - def build_dataset( - self, input_file_pattern, batch_size, is_training=True, prebatch_size=0, epochs=1, shuffle=True, *args, **kwargs - ): + def build_dataset(self, input_file_pattern, batch_size, is_training=True, epochs=1, shuffle=True, *args, **kwargs): if self.dataset_type == "squad": return self.create_squad_dataset( input_file_pattern, @@ -224,7 +220,7 @@ def _select_data_from_record(record): if is_training: dataset = dataset.shuffle(100) - dataset = dataset.repeat(FLAGS.epochs) + dataset = dataset.repeat(flags.FLAGS.epochs) dataset = dataset.batch(batch_size, drop_remainder=True) dataset = dataset.prefetch(1024) diff --git a/deepray/datasets/squad/squad_dataset.py b/deepray/datasets/squad/squad_dataset.py new file mode 100644 index 00000000..4cfa4801 --- /dev/null +++ b/deepray/datasets/squad/squad_dataset.py @@ -0,0 +1,111 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""BERT model input pipelines.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from deepray.datasets.datapipeline import DataPipeline +from deepray.utils.horovod_utils import get_rank, get_world_size + + +class Squad(DataPipeline): + + def __init__(self, max_seq_length, input_pipeline_context=None, **kwargs): + super().__init__(**kwargs) + self.max_seq_length = max_seq_length + self.input_pipeline_context = input_pipeline_context + + def decode_record(self, record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.io.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.cast(t, tf.int32) + example[name] = t + + return example + + def single_file_dataset(self, input_file, name_to_features): + """Creates a single-file dataset to be passed for BERT custom training.""" + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + d = tf.data.TFRecordDataset(input_file) + if self.use_horovod: + d = d.shard(num_shards=get_world_size(), index=get_rank()) + + d = d.map(lambda record: self.decode_record(record, name_to_features)) + + # When `input_file` is a path to a single file or a list + # containing a single path, disable auto sharding so that + # same input file is sent to all workers. + if isinstance(input_file, str) or len(input_file) == 1: + options = tf.data.Options() + options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF + d = d.with_options(options) + return d + + def build_dataset(self, input_file_pattern, batch_size, is_training=True, epochs=1, shuffle=False, *args, **kwargs): + """Creates input dataset from (tf)records files for train/eval.""" + name_to_features = { + 'input_ids': tf.io.FixedLenFeature([self.max_seq_length], tf.int64), + 'input_mask': tf.io.FixedLenFeature([self.max_seq_length], tf.int64), + 'segment_ids': tf.io.FixedLenFeature([self.max_seq_length], tf.int64), + } + if is_training: + name_to_features['start_positions'] = tf.io.FixedLenFeature([], tf.int64) + name_to_features['end_positions'] = tf.io.FixedLenFeature([], tf.int64) + else: + name_to_features['unique_ids'] = tf.io.FixedLenFeature([], tf.int64) + + dataset = self.single_file_dataset(input_file_pattern, name_to_features) + + # The dataset is always sharded by number of hosts. + # num_input_pipelines is the number of hosts rather than number of cores. 
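+    # input_pipeline_id is this host's index in [0, num_input_pipelines), so
+    # each host reads a disjoint slice of the records.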
+ if self.input_pipeline_context and self.input_pipeline_context.num_input_pipelines > 1: + dataset = dataset.shard( + self.input_pipeline_context.num_input_pipelines, self.input_pipeline_context.input_pipeline_id + ) + + def parser(record): + """Dispatches record to features and labels.""" + x, y = {}, {} + for name, tensor in record.items(): + if name in ('start_positions', 'end_positions'): + y[name] = tensor + elif name == 'input_ids': + x['input_word_ids'] = tensor + elif name == 'segment_ids': + x['input_type_ids'] = tensor + else: + x[name] = tensor + return x, y + + dataset = dataset.map(parser) + + if is_training: + dataset = dataset.shuffle(100) + # dataset = dataset.repeat() + + dataset = dataset.batch(batch_size, drop_remainder=True) + dataset = dataset.prefetch(1024) + return dataset diff --git a/deepray/datasets/squad/squad_test.py b/deepray/datasets/squad/squad_test.py index 1ceadbc5..03732f97 100644 --- a/deepray/datasets/squad/squad_test.py +++ b/deepray/datasets/squad/squad_test.py @@ -12,8 +12,6 @@ from .squad import Squad -FLAGS = flags.FLAGS - TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") SQUAD_VERSION = "1.1" diff --git a/deepray/datasets/tfrecord_pipeline/tfrecord_pipeline.py b/deepray/datasets/tfrecord_pipeline/tfrecord_pipeline.py index 35327304..926b729f 100644 --- a/deepray/datasets/tfrecord_pipeline/tfrecord_pipeline.py +++ b/deepray/datasets/tfrecord_pipeline/tfrecord_pipeline.py @@ -1,15 +1,12 @@ import multiprocessing import tensorflow as tf -from absl import flags -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline from deepray.utils.horovod_utils import get_rank, get_world_size -FLAGS = flags.FLAGS - -class TFRecordPipeline(DataPipeLine): +class TFRecordPipeline(DataPipeline): """ Build a pipeline fetching, shuffling, and preprocessing the tfrecord files. 
""" @@ -41,9 +38,7 @@ def parser(self, record): label_map[label] = tensor.pop(label) return tensor, label_map - def build_dataset( - self, input_file_pattern, batch_size, is_training=True, prebatch_size=0, epochs=1, shuffle=True, *args, **kwargs - ): + def build_dataset(self, input_file_pattern, batch_size, is_training=True, epochs=1, shuffle=True, *args, **kwargs): input_files = tf.io.gfile.glob(input_file_pattern) # When `input_file` is a path to a single file or a list diff --git a/deepray/datasets/tfrecord_pipeline/tfrecord_pipeline_test.py b/deepray/datasets/tfrecord_pipeline/tfrecord_pipeline_test.py index 4fcde0c1..46052299 100644 --- a/deepray/datasets/tfrecord_pipeline/tfrecord_pipeline_test.py +++ b/deepray/datasets/tfrecord_pipeline/tfrecord_pipeline_test.py @@ -11,8 +11,6 @@ from deepray.datasets.tfrecord_pipeline import TFRecordPipeline from deepray.utils.benchmark import PerformanceCalculator -FLAGS = flags.FLAGS - TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/deepray/datasets/toxic_comment_classification_challenge/toxic_comment_classification_challenge.py b/deepray/datasets/toxic_comment_classification_challenge/toxic_comment_classification_challenge.py index 770bb778..1d5b1ffc 100644 --- a/deepray/datasets/toxic_comment_classification_challenge/toxic_comment_classification_challenge.py +++ b/deepray/datasets/toxic_comment_classification_challenge/toxic_comment_classification_challenge.py @@ -9,18 +9,17 @@ from sklearn.model_selection import train_test_split from texthero import preprocessing -from deepray.datasets.datapipeline import DataPipeLine +from deepray.datasets.datapipeline import DataPipeline os.environ['CURL_CA_BUNDLE'] = '' -FLAGS = flags.FLAGS FLAGS([ sys.argv[0], "--num_train_examples=111699", ]) -class ToxicCommentClassificationChallenge(DataPipeLine): +class ToxicCommentClassificationChallenge(DataPipeline): def __init__(self, path="/workspaces/dataset/jigsaw-toxic-comment-classification-challenge", **kwargs): super().__init__(**kwargs) @@ -72,7 +71,7 @@ def __init__(self, path="/workspaces/dataset/jigsaw-toxic-comment-classification self.train_bert = hero.clean(train['comment_text'], clean_text_bert_pipeline) self.test_bert = hero.clean(test['comment_text'], clean_text_bert_pipeline) - def build_dataset(self, input_file_pattern, batch_size, is_training=True, prebatch_size=0, *args, **kwargs): + def build_dataset(self, input_file_pattern, batch_size, is_training=True, *args, **kwargs): if is_training: ds = tf.data.Dataset.from_tensor_slices((self.train_bert, self.y_train)) else: diff --git a/deepray/datasets/toxic_comment_classification_challenge/toxic_comment_classification_challenge_test.py b/deepray/datasets/toxic_comment_classification_challenge/toxic_comment_classification_challenge_test.py index 53ecee03..6581d356 100644 --- a/deepray/datasets/toxic_comment_classification_challenge/toxic_comment_classification_challenge_test.py +++ b/deepray/datasets/toxic_comment_classification_challenge/toxic_comment_classification_challenge_test.py @@ -9,8 +9,6 @@ from .toxic_comment_classification_challenge import ToxicCommentClassificationChallenge -FLAGS = flags.FLAGS - TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/deepray/datasets/wikicorpus_en/wikicorpus_en.py b/deepray/datasets/wikicorpus_en/wikicorpus_en.py index 45b5fc03..95a9c8c5 100644 --- a/deepray/datasets/wikicorpus_en/wikicorpus_en.py +++ b/deepray/datasets/wikicorpus_en/wikicorpus_en.py @@ -21,7 +21,6 @@ from deepray.datasets.tfrecord_pipeline import 
TFRecordPipeline -FLAGS = flags.FLAGS FLAGS([ sys.argv[0], "--num_train_examples=24324736", diff --git a/deepray/datasets/wikicorpus_en/wikicorpus_en_test.py b/deepray/datasets/wikicorpus_en/wikicorpus_en_test.py index dfd1666c..164133f5 100644 --- a/deepray/datasets/wikicorpus_en/wikicorpus_en_test.py +++ b/deepray/datasets/wikicorpus_en/wikicorpus_en_test.py @@ -13,8 +13,6 @@ from .wikicorpus_en import Wikicorpus_en -FLAGS = flags.FLAGS - TIME_STAMP = datetime.now().strftime("%Y%m%d-%H%M%S") diff --git a/deepray/deepray.bzl b/deepray/deepray.bzl index 74a64031..d2428100 100644 --- a/deepray/deepray.bzl +++ b/deepray/deepray.bzl @@ -1,41 +1,59 @@ +load("@local_config_cuda//cuda:build_defs.bzl", "cuda_library") load("@local_config_tf//:build_defs.bzl", "CPLUSPLUS_VERSION", "D_GLIBCXX_USE_CXX11_ABI") -load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda", "if_cuda_is_configured") +load( + "@org_tensorflow//tensorflow:py.default.bzl", + _plain_py_library = "py_library", +) +load("@org_tensorflow//tensorflow:tensorflow.bzl", "tf_binary_additional_srcs", "tf_copts") +load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library") + +cc_shared_library = native.cc_shared_library + +def _cuda_copts(opts = []): + """Gets the appropriate set of copts for (maybe) CUDA compilation. + + If we're doing CUDA compilation, returns copts for our particular CUDA + compiler. If we're not doing CUDA compilation, returns an empty list. + + """ + return select({ + "//conditions:default": [], + "@local_config_cuda//cuda:using_nvcc": [ + "-nvcc_options=relaxed-constexpr", + "-nvcc_options=ftz=true", + ] + opts, + "@local_config_cuda//cuda:using_clang": [ + "-fcuda-flush-denormals-to-zero", + ] + opts, + }) def custom_op_library( name, srcs = [], - cuda_srcs = [], + gpu_srcs = [], deps = [], - cuda_deps = [], + gpu_deps = [], copts = [], **kwargs): + """ + Reference: https://github.com/tensorflow/addons/blob/master/tensorflow_addons/tensorflow_addons.bzl + """ deps = deps + [ "@local_config_tf//:libtensorflow_framework", "@local_config_tf//:libtensorflow_cc", "@local_config_tf//:tf_header_lib", ] - if cuda_srcs: - copts = copts + if_cuda(["-DGOOGLE_CUDA=1"]) - cuda_copts = copts + if_cuda_is_configured([ - "-x cuda", - "-nvcc_options=relaxed-constexpr", - "-nvcc_options=ftz=true", - ]) - cuda_deps = deps + if_cuda_is_configured(cuda_deps) + if_cuda_is_configured([ - "@local_config_cuda//cuda:cuda_headers", - "@local_config_cuda//cuda:cudart_static", - ]) + if gpu_srcs: basename = name.split(".")[0] - native.cc_library( + cuda_library( name = basename + "_gpu", - srcs = cuda_srcs, - deps = cuda_deps, - copts = cuda_copts, - alwayslink = 1, + srcs = gpu_srcs, + copts = copts + tf_copts() + _cuda_copts(), + deps = deps + gpu_deps, **kwargs ) - deps = deps + if_cuda_is_configured([":" + basename + "_gpu"]) + deps = deps + [":" + basename + "_gpu"] copts = copts + select({ "//deepray:windows": [ @@ -67,3 +85,281 @@ def custom_op_library( deps = deps, **kwargs ) + +def clean_dep(target): + """Returns string to 'target' in @org_tensorflow repository. + + Use this function when referring to targets in the @org_tensorflow + repository from macros that may be called from external repositories. + """ + + # A repo-relative label is resolved relative to the file in which the + # Label() call appears, i.e. @org_tensorflow. 
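+    # str() converts the resolved Label back to its canonical string form so it
+    # can be used directly as a select() key.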
+ return str(Label(target)) + +def filegroup(**kwargs): + native.filegroup(**kwargs) + +def _rpath_user_link_flags(name): + # Search parent directories up to the TensorFlow root directory for shared + # object dependencies, even if this op shared object is deeply nested + # (e.g. tensorflow/contrib/package:python/ops/_op_lib.so). tensorflow/ is then + # the root and tensorflow/libtensorflow_framework.so should exist when + # deployed. Other shared object dependencies (e.g. shared between contrib/ + # ops) are picked up as long as they are in either the same or a parent + # directory in the tensorflow/ tree. + levels_to_root = native.package_name().count("/") + name.count("/") + return select({ + clean_dep("@platforms//os:macos"): [ + "-Wl,%s" % (_make_search_paths("@loader_path", levels_to_root),), + "-Wl,-rename_section,__TEXT,text_env,__TEXT,__text", + ], + clean_dep("//deepray:windows"): [], + "//conditions:default": [ + "-Wl,%s" % (_make_search_paths("$ORIGIN", levels_to_root),), + ], + }) + +def _rpath_linkopts(name): + # Search parent directories up to the TensorFlow root directory for shared + # object dependencies, even if this op shared object is deeply nested + # (e.g. tensorflow/contrib/package:python/ops/_op_lib.so). tensorflow/ is then + # the root and tensorflow/libtensorflow_framework.so should exist when + # deployed. Other shared object dependencies (e.g. shared between contrib/ + # ops) are picked up as long as they are in either the same or a parent + # directory in the tensorflow/ tree. + levels_to_root = native.package_name().count("/") + name.count("/") + return select({ + clean_dep("@platforms//os:macos"): [ + "-Wl,%s" % (_make_search_paths("@loader_path", levels_to_root),), + "-Wl,-rename_section,__TEXT,text_env,__TEXT,__text", + ], + clean_dep("//deepray:windows"): [], + "//conditions:default": [ + "-Wl,%s" % (_make_search_paths("$$ORIGIN", levels_to_root),), + ], + }) + +def _make_search_paths(prefix, levels_to_root): + return ",".join( + [ + "-rpath,%s/%s" % (prefix, "/".join([".."] * search_level)) + for search_level in range(levels_to_root + 1) + ], + ) + +# buildozer: disable=function-docstring-args +def pybind_extension_opensource( + name, + srcs, + module_name = None, # Unused. + hdrs = [], + dynamic_deps = [], + static_deps = [], + deps = [], + additional_exported_symbols = [], + compatible_with = None, + copts = [], + data = [], + defines = [], + deprecation = None, + enable_stub_generation = False, # Unused. + additional_stubgen_deps = [], # Unused. 
+ features = [], + link_in_framework = False, + licenses = None, + linkopts = [], + pytype_deps = [], + pytype_srcs = [], + restricted_to = None, + srcs_version = "PY3", + testonly = None, + visibility = None, + win_def_file = None): + """Builds a generic Python extension module.""" + _ignore = [enable_stub_generation, additional_stubgen_deps, module_name] # buildifier: disable=unused-variable + p = name.rfind("/") + if p == -1: + sname = name + prefix = "" + else: + sname = name[p + 1:] + prefix = name[:p + 1] + so_file = "%s%s.so" % (prefix, sname) + filegroup_name = "%s_filegroup" % name + pyd_file = "%s%s.pyd" % (prefix, sname) + exported_symbols = [ + "init%s" % sname, + "init_%s" % sname, + "PyInit_%s" % sname, + ] + additional_exported_symbols + + exported_symbols_file = "%s-exported-symbols.lds" % name + version_script_file = "%s-version-script.lds" % name + + exported_symbols_output = "\n".join(["_%s" % symbol for symbol in exported_symbols]) + version_script_output = "\n".join([" %s;" % symbol for symbol in exported_symbols]) + + native.genrule( + name = name + "_exported_symbols", + outs = [exported_symbols_file], + cmd = "echo '%s' >$@" % exported_symbols_output, + output_licenses = ["unencumbered"], + visibility = ["//visibility:private"], + testonly = testonly, + ) + + native.genrule( + name = name + "_version_script", + outs = [version_script_file], + cmd = "echo '{global:\n%s\n local: *;};' >$@" % version_script_output, + output_licenses = ["unencumbered"], + visibility = ["//visibility:private"], + testonly = testonly, + ) + + if static_deps: + cc_library_name = so_file + "_cclib" + cc_library( + name = cc_library_name, + hdrs = hdrs, + srcs = srcs + hdrs, + data = data, + deps = deps, + compatible_with = compatible_with, + copts = copts + [ + "-fno-strict-aliasing", + "-fexceptions", + ] + select({ + clean_dep("//deepray:windows"): [], + "//conditions:default": [ + "-fvisibility=hidden", + ], + }), + defines = defines, + features = features + ["-use_header_modules"], + restricted_to = restricted_to, + testonly = testonly, + visibility = visibility, + ) + + cc_shared_library( + name = so_file, + roots = [cc_library_name], + dynamic_deps = dynamic_deps, + static_deps = static_deps, + additional_linker_inputs = [exported_symbols_file, version_script_file], + compatible_with = compatible_with, + deprecation = deprecation, + features = features + ["-use_header_modules"], + licenses = licenses, + restricted_to = restricted_to, + shared_lib_name = so_file, + testonly = testonly, + user_link_flags = linkopts + _rpath_user_link_flags(name) + select({ + clean_dep("@platforms//os:macos"): [ + # TODO: the -w suppresses a wall of harmless warnings about hidden typeinfo symbols + # not being exported. There should be a better way to deal with this. + "-Wl,-w", + "-Wl,-exported_symbols_list,$(location %s)" % exported_symbols_file, + ], + clean_dep("//deepray:windows"): [], + "//conditions:default": [ + "-Wl,--version-script", + "$(location %s)" % version_script_file, + ], + }), + visibility = visibility, + ) + + # cc_shared_library can generate more than one file. + # Solution to avoid the error "variable '$<' : more than one input file." 
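+        # Selecting only the "main_shared_library_output" output group leaves a
+        # single .so for downstream rules to consume.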
+ filegroup( + name = filegroup_name, + srcs = [so_file], + output_group = "main_shared_library_output", + testonly = testonly, + ) + else: + if link_in_framework: + srcs += tf_binary_additional_srcs() + + cc_binary( + name = so_file, + srcs = srcs + hdrs, + data = data, + copts = copts + [ + "-fno-strict-aliasing", + "-fexceptions", + ] + select({ + clean_dep("//deepray:windows"): [], + "//conditions:default": [ + "-fvisibility=hidden", + ], + }), + linkopts = linkopts + _rpath_linkopts(name) + select({ + clean_dep("@platforms//os:macos"): [ + # TODO: the -w suppresses a wall of harmless warnings about hidden typeinfo symbols + # not being exported. There should be a better way to deal with this. + "-Wl,-w", + "-Wl,-exported_symbols_list,$(location %s)" % exported_symbols_file, + ], + clean_dep("//deepray:windows"): [], + "//conditions:default": [ + "-Wl,--version-script", + "$(location %s)" % version_script_file, + ], + }), + deps = deps + [ + exported_symbols_file, + version_script_file, + ], + defines = defines, + features = features + ["-use_header_modules"], + linkshared = 1, + testonly = testonly, + licenses = licenses, + visibility = visibility, + deprecation = deprecation, + restricted_to = restricted_to, + compatible_with = compatible_with, + ) + + # For Windows, emulate the above filegroup with the shared object. + native.alias( + name = filegroup_name, + actual = so_file, + ) + + # For Windows only. + native.genrule( + name = name + "_pyd_copy", + srcs = [filegroup_name], + outs = [pyd_file], + cmd = "cp $< $@", + output_to_bindir = True, + visibility = visibility, + deprecation = deprecation, + restricted_to = restricted_to, + compatible_with = compatible_with, + testonly = testonly, + ) + + _plain_py_library( + name = name, + data = select({ + clean_dep("//deepray:windows"): [pyd_file], + "//conditions:default": [so_file], + }) + pytype_srcs, + deps = pytype_deps, + srcs_version = srcs_version, + licenses = licenses, + testonly = testonly, + visibility = visibility, + deprecation = deprecation, + restricted_to = restricted_to, + compatible_with = compatible_with, + ) + +# Export open source version of pybind_extension under base name as well. 
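+#
+# A typical call site might look like the following (the target and file names
+# here are illustrative only, not taken from this repository):
+#
+#   pybind_extension(
+#       name = "_example_ops",
+#       srcs = ["example_ops_pybind.cc"],
+#       deps = [":example_ops_lib"],
+#   )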
+pybind_extension = pybind_extension_opensource diff --git a/deepray/layers/BUILD b/deepray/layers/BUILD index 4b73fd12..57ef2821 100644 --- a/deepray/layers/BUILD +++ b/deepray/layers/BUILD @@ -13,8 +13,9 @@ py_library( "//deepray/activations", "//deepray/layers/rnn", "//deepray/testing", - "//deepray/text", + # "//deepray/text", "//deepray/utils", + "@pypi_pandas//:pkg", ], ) diff --git a/deepray/layers/__init__.py b/deepray/layers/__init__.py index d445c511..88a5336f 100644 --- a/deepray/layers/__init__.py +++ b/deepray/layers/__init__.py @@ -38,10 +38,9 @@ from deepray.layers.spatial_pyramid_pooling import SpatialPyramidPooling2D from deepray.layers.tlu import TLU from deepray.layers.wrappers import WeightNormalization -from deepray.layers.esn import ESN from deepray.layers.stochastic_depth import StochasticDepth from deepray.layers.noisy_dense import NoisyDense -from deepray.layers.crf import CRF +# from deepray.layers.crf import CRF from deepray.layers.on_device_embedding import OnDeviceEmbedding from deepray.layers.position_embedding import PositionEmbedding diff --git a/deepray/layers/attention.py b/deepray/layers/attention.py index 9d22095d..9a974003 100644 --- a/deepray/layers/attention.py +++ b/deepray/layers/attention.py @@ -23,15 +23,13 @@ import numpy as np import tensorflow as tf -from tensorflow import keras -from tensorflow.keras import layers +import tf_keras as keras from deepray.layers import dense_einsum from deepray.layers import masked_softmax -# @tf.keras.utils.register_keras_serializable(package="Text") -class Attention(tf.keras.layers.Layer): +class Attention(keras.layers.Layer): """Attention layer. This is an implementation of multi-headed attention based on "Attention @@ -80,12 +78,12 @@ def __init__( self._num_heads = num_heads self._head_size = head_size self._dropout_rate = dropout_rate - self._kernel_initializer = tf.keras.initializers.get(kernel_initializer) - self._bias_initializer = tf.keras.initializers.get(bias_initializer) - self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer) - self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer) - self._kernel_constraint = tf.keras.constraints.get(kernel_constraint) - self._bias_constraint = tf.keras.constraints.get(bias_constraint) + self._kernel_initializer = keras.initializers.get(kernel_initializer) + self._bias_initializer = keras.initializers.get(bias_initializer) + self._kernel_regularizer = keras.regularizers.get(kernel_regularizer) + self._bias_regularizer = keras.regularizers.get(bias_regularizer) + self._kernel_constraint = keras.constraints.get(kernel_constraint) + self._bias_constraint = keras.constraints.get(bias_constraint) self._query_dense = dense_einsum.DenseEinsum( output_shape=(self._num_heads, self._head_size), @@ -125,20 +123,20 @@ def __init__( self._masked_softmax = masked_softmax.MaskedSoftmax(mask_expansion_axes=[1]) - self._dropout = tf.keras.layers.Dropout(rate=self._dropout_rate) + self._dropout = keras.layers.Dropout(rate=self._dropout_rate) def get_config(self): config = { "num_heads": self._num_heads, "head_size": self._head_size, "dropout_rate": self._dropout_rate, - "kernel_initializer": tf.keras.initializers.serialize(self._kernel_initializer), - "bias_initializer": tf.keras.initializers.serialize(self._bias_initializer), - "kernel_regularizer": tf.keras.regularizers.serialize(self._kernel_regularizer), - "bias_regularizer": tf.keras.regularizers.serialize(self._bias_regularizer), - "activity_regularizer": 
tf.keras.regularizers.serialize(self._activity_regularizer), - "kernel_constraint": tf.keras.constraints.serialize(self._kernel_constraint), - "bias_constraint": tf.keras.constraints.serialize(self._bias_constraint) + "kernel_initializer": keras.initializers.serialize(self._kernel_initializer), + "bias_initializer": keras.initializers.serialize(self._bias_initializer), + "kernel_regularizer": keras.regularizers.serialize(self._kernel_regularizer), + "bias_regularizer": keras.regularizers.serialize(self._bias_regularizer), + "activity_regularizer": keras.regularizers.serialize(self._activity_regularizer), + "kernel_constraint": keras.constraints.serialize(self._kernel_constraint), + "bias_constraint": keras.constraints.serialize(self._bias_constraint) } base_config = super(Attention, self).get_config() return dict(list(base_config.items()) + list(config.items())) @@ -190,7 +188,6 @@ def call(self, inputs): return tf.einsum("BNFT,BTNH->BFNH", attention_probs, value_tensor) -# @tf.keras.utils.register_keras_serializable(package="Text") class CachedAttention(Attention): """Attention layer with cache used for auto-agressive decoding. @@ -266,7 +263,7 @@ def call(self, inputs, decode_loop_step=None): return tf.einsum("BNFT,BTNH->BFNH", attention_probs, value_tensor), cache -class WindowAttention(tf.keras.layers.Layer): +class WindowAttention(keras.layers.Layer): """ ## Window based multi-head self-attention @@ -284,9 +281,9 @@ def __init__(self, dim, window_size, num_heads, qkv_bias=True, dropout_rate=0.0, self.window_size = window_size self.num_heads = num_heads self.scale = (dim // num_heads)**-0.5 - self.qkv = layers.Dense(dim * 3, use_bias=qkv_bias) - self.dropout = layers.Dropout(dropout_rate) - self.proj = layers.Dense(dim) + self.qkv = keras.layers.Dense(dim * 3, use_bias=qkv_bias) + self.dropout = keras.layers.Dropout(dropout_rate) + self.proj = keras.layers.Dense(dim) def build(self, input_shape): num_window_elements = (2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1) diff --git a/deepray/layers/dcn.py b/deepray/layers/dcn.py index 935b2a2d..48658c6a 100644 --- a/deepray/layers/dcn.py +++ b/deepray/layers/dcn.py @@ -16,9 +16,10 @@ from typing import Union, Text, Optional import tensorflow as tf +import tf_keras as keras -class Cross(tf.keras.layers.Layer): +class Cross(keras.layers.Layer): """Cross Layer in Deep & Cross Network to learn explicit feature interactions. 
A layer that creates explicit and bounded-degree feature interactions @@ -43,12 +44,12 @@ class Cross(tf.keras.layers.Layer): ```python # after embedding layer in a functional model: - input = tf.keras.Input(shape=(None,), name='index', dtype=tf.int64) + input = keras.Input(shape=(None,), name='index', dtype=tf.int64) x0 = dp.layers.Embedding(vocabulary_size=32, embedding_dim=6) x1 = Cross()(x0, x0) x2 = Cross()(x0, x1) - logits = tf.keras.layers.Dense(units=10)(x2) - model = tf.keras.Model(input, logits) + logits = keras.layers.Dense(units=10)(x2) + model = keras.Model(input, logits) ``` Args: @@ -82,11 +83,11 @@ def __init__( projection_dim: Optional[int] = None, diag_scale: Optional[float] = 0.0, use_bias: bool = True, - preactivation: Optional[Union[str, tf.keras.layers.Activation]] = None, - kernel_initializer: Union[Text, tf.keras.initializers.Initializer] = "truncated_normal", - bias_initializer: Union[Text, tf.keras.initializers.Initializer] = "zeros", - kernel_regularizer: Union[Text, None, tf.keras.regularizers.Regularizer] = None, - bias_regularizer: Union[Text, None, tf.keras.regularizers.Regularizer] = None, + preactivation: Optional[Union[str, keras.layers.Activation]] = None, + kernel_initializer: Union[Text, keras.initializers.Initializer] = "truncated_normal", + bias_initializer: Union[Text, keras.initializers.Initializer] = "zeros", + kernel_regularizer: Union[Text, None, keras.regularizers.Regularizer] = None, + bias_regularizer: Union[Text, None, keras.regularizers.Regularizer] = None, **kwargs ): @@ -95,11 +96,11 @@ def __init__( self._projection_dim = projection_dim self._diag_scale = diag_scale self._use_bias = use_bias - self._preactivation = tf.keras.activations.get(preactivation) - self._kernel_initializer = tf.keras.initializers.get(kernel_initializer) - self._bias_initializer = tf.keras.initializers.get(bias_initializer) - self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer) - self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer) + self._preactivation = keras.activations.get(preactivation) + self._kernel_initializer = keras.initializers.get(kernel_initializer) + self._bias_initializer = keras.initializers.get(bias_initializer) + self._kernel_regularizer = keras.regularizers.get(kernel_regularizer) + self._bias_regularizer = keras.regularizers.get(bias_regularizer) self._input_dim = None self._supports_masking = True @@ -111,7 +112,7 @@ def build(self, input_shape): last_dim = input_shape[-1] if self._projection_dim is None: - self._dense = tf.keras.layers.Dense( + self._dense = keras.layers.Dense( last_dim, kernel_initializer=_clone_initializer(self._kernel_initializer), bias_initializer=self._bias_initializer, @@ -122,14 +123,14 @@ def build(self, input_shape): activation=self._preactivation, ) else: - self._dense_u = tf.keras.layers.Dense( + self._dense_u = keras.layers.Dense( self._projection_dim, kernel_initializer=_clone_initializer(self._kernel_initializer), kernel_regularizer=self._kernel_regularizer, use_bias=False, dtype=self.dtype, ) - self._dense_v = tf.keras.layers.Dense( + self._dense_v = keras.layers.Dense( last_dim, kernel_initializer=_clone_initializer(self._kernel_initializer), bias_initializer=self._bias_initializer, @@ -183,11 +184,11 @@ def get_config(self): "projection_dim": self._projection_dim, "diag_scale": self._diag_scale, "use_bias": self._use_bias, - "preactivation": tf.keras.activations.serialize(self._preactivation), - "kernel_initializer": tf.keras.initializers.serialize(self._kernel_initializer), 
- "bias_initializer": tf.keras.initializers.serialize(self._bias_initializer), - "kernel_regularizer": tf.keras.regularizers.serialize(self._kernel_regularizer), - "bias_regularizer": tf.keras.regularizers.serialize(self._bias_regularizer), + "preactivation": keras.activations.serialize(self._preactivation), + "kernel_initializer": keras.initializers.serialize(self._kernel_initializer), + "bias_initializer": keras.initializers.serialize(self._bias_initializer), + "kernel_regularizer": keras.regularizers.serialize(self._kernel_regularizer), + "bias_regularizer": keras.regularizers.serialize(self._bias_regularizer), } base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) diff --git a/deepray/layers/dense.py b/deepray/layers/dense.py new file mode 100644 index 00000000..98e2c3f6 --- /dev/null +++ b/deepray/layers/dense.py @@ -0,0 +1,287 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Contains the Dense layer.""" +# pylint: disable=g-classes-have-attributes,g-direct-tensorflow-import + +import tensorflow.compat.v2 as tf +from packaging import version + +if version.parse(tf.__version__.replace("-tf", "+tf")) < version.parse("2.11"): + from keras import activations + from keras import backend + from keras import constraints + from keras import initializers + from keras import regularizers + from keras.dtensor import utils + from keras.engine.base_layer import Layer + from keras.engine.input_spec import InputSpec +else: + from keras.src.dtensor import utils + from keras.src import activations + from keras.src import backend + from keras.src import constraints + from keras.src import initializers + from keras.src import regularizers + from keras.src.dtensor import utils + from keras.src.engine.base_layer import Layer + from keras.src.engine.input_spec import InputSpec + + +class Dense(Layer): + """Just your regular densely-connected NN layer. + + `Dense` implements the operation: + `output = activation(dot(input, kernel) + bias)` + where `activation` is the element-wise activation function + passed as the `activation` argument, `kernel` is a weights matrix + created by the layer, and `bias` is a bias vector created by the layer + (only applicable if `use_bias` is `True`). These are all attributes of + `Dense`. + + Note: If the input to the layer has a rank greater than 2, then `Dense` + computes the dot product between the `inputs` and the `kernel` along the + last axis of the `inputs` and axis 0 of the `kernel` (using `tf.tensordot`). + For example, if input has dimensions `(batch_size, d0, d1)`, + then we create a `kernel` with shape `(d1, units)`, and the `kernel` operates + along axis 2 of the `input`, on every sub-tensor of shape `(1, 1, d1)` + (there are `batch_size * d0` such sub-tensors). + The output in this case will have shape `(batch_size, d0, units)`. 
+
+  Besides, layer attributes cannot be modified after the layer has been called
+  once (except the `trainable` attribute).
+  When a popular kwarg `input_shape` is passed, then keras will create
+  an input layer to insert before the current layer. This can be treated
+  equivalent to explicitly defining an `InputLayer`.
+
+  Example:
+
+  >>> # Create a `Sequential` model and add a Dense layer as the first layer.
+  >>> model = tf.keras.models.Sequential()
+  >>> model.add(tf.keras.Input(shape=(16,)))
+  >>> model.add(tf.keras.layers.Dense(32, activation='relu'))
+  >>> # Now the model will take as input arrays of shape (None, 16)
+  >>> # and output arrays of shape (None, 32).
+  >>> # Note that after the first layer, you don't need to specify
+  >>> # the size of the input anymore:
+  >>> model.add(tf.keras.layers.Dense(32))
+  >>> model.output_shape
+  (None, 32)
+
+  Args:
+    units: Positive integer, dimensionality of the output space.
+    activation: Activation function to use.
+      If you don't specify anything, no activation is applied
+      (ie. "linear" activation: `a(x) = x`).
+    use_bias: Boolean, whether the layer uses a bias vector.
+    kernel_initializer: Initializer for the `kernel` weights matrix.
+    bias_initializer: Initializer for the bias vector.
+    kernel_regularizer: Regularizer function applied to
+      the `kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to
+      the output of the layer (its "activation").
+    kernel_constraint: Constraint function applied to
+      the `kernel` weights matrix.
+    bias_constraint: Constraint function applied to the bias vector.
+
+  Input shape:
+    N-D tensor with shape: `(batch_size, ..., input_dim)`.
+    The most common situation would be
+    a 2D input with shape `(batch_size, input_dim)`.
+
+  Output shape:
+    N-D tensor with shape: `(batch_size, ..., units)`.
+    For instance, for a 2D input with shape `(batch_size, input_dim)`,
+    the output would have shape `(batch_size, units)`.
+  """
+
+  @utils.allow_initializer_layout
+  def __init__(
+      self,
+      units,
+      activation=None,
+      use_bias=True,
+      kernel_initializer='glorot_uniform',
+      bias_initializer='zeros',
+      kernel_regularizer=None,
+      bias_regularizer=None,
+      activity_regularizer=None,
+      kernel_constraint=None,
+      bias_constraint=None,
+      **kwargs
+  ):
+    super(Dense, self).__init__(activity_regularizer=activity_regularizer, **kwargs)
+
+    # `name` is consumed by the base Layer constructor above, which exposes it
+    # as the read-only `self.name` property used below.
+    self.units = int(units) if not isinstance(units, int) else units
+    if self.units < 0:
+      raise ValueError(
+          f'Received an invalid value for `units`, expected '
+          f'a positive integer. Received: units={units}'
+      )
+    self.activation = activations.get(activation)
+    self.use_bias = use_bias
+    self.kernel_initializer = initializers.get(kernel_initializer)
+    self.bias_initializer = initializers.get(bias_initializer)
+    self.kernel_regularizer = regularizers.get(kernel_regularizer)
+    self.bias_regularizer = regularizers.get(bias_regularizer)
+    self.kernel_constraint = constraints.get(kernel_constraint)
+    self.bias_constraint = constraints.get(bias_constraint)
+
+    self.input_spec = InputSpec(min_ndim=2)
+    self.supports_masking = True
+
+  def build(self, input_shape):
+    dtype = tf.as_dtype(self.dtype or backend.floatx())
+    if not (dtype.is_floating or dtype.is_complex):
+      raise TypeError('A Dense layer can only be built with a floating-point '
+                      f'dtype.
Received: dtype={dtype}') + + input_shape = tf.TensorShape(input_shape) + last_dim = tf.compat.dimension_value(input_shape[-1]) + if last_dim is None: + raise ValueError( + 'The last dimension of the inputs to a Dense layer ' + 'should be defined. Found None. ' + f'Full input shape received: {input_shape}' + ) + self.input_spec = InputSpec(min_ndim=2, axes={-1: last_dim}) + self.kernel = self.add_weight( + '%skernel' % self.name + '_' if self.name else "", + shape=[last_dim, self.units], + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + dtype=self.dtype, + trainable=True + ) + if self.use_bias: + self.bias = self.add_weight( + '%sbias' % self.name + '_' if self.name else "", + shape=[ + self.units, + ], + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + constraint=self.bias_constraint, + dtype=self.dtype, + trainable=True + ) + else: + self.bias = None + self.built = True + + def call(self, inputs): + if inputs.dtype.base_dtype != self._compute_dtype_object.base_dtype: + inputs = tf.cast(inputs, dtype=self._compute_dtype_object) + + is_ragged = isinstance(inputs, tf.RaggedTensor) + if is_ragged: + # In case we encounter a RaggedTensor with a fixed last dimension (last + # dimension not ragged), we can flatten the input and restore the ragged + # dimensions at the end. + if tf.compat.dimension_value(inputs.shape[-1]) is None: + raise ValueError( + 'Dense layer only supports RaggedTensors when the ' + 'innermost dimension is non-ragged. Received: ' + f'inputs.shape={inputs.shape}.' + ) + original_inputs = inputs + if inputs.flat_values.shape.rank > 1: + inputs = inputs.flat_values + else: + # Innermost partition is encoded using uniform_row_length. + # (This is unusual, but we can handle it.) + if inputs.shape.rank == 2: + inputs = inputs.to_tensor() + is_ragged = False + else: + for _ in range(original_inputs.ragged_rank - 1): + inputs = inputs.values + inputs = inputs.to_tensor() + original_inputs = tf.RaggedTensor.from_nested_row_splits(inputs, original_inputs.nested_row_splits[:-1]) + + rank = inputs.shape.rank + if rank == 2 or rank is None: + # We use embedding_lookup_sparse as a more efficient matmul operation for + # large sparse input tensors. The op will result in a sparse gradient, as + # opposed to sparse_ops.sparse_tensor_dense_matmul which results in dense + # gradients. This can lead to sigfinicant speedups, see b/171762937. + if isinstance(inputs, tf.SparseTensor): + # We need to fill empty rows, as the op assumes at least one id per row. + inputs, _ = tf.sparse.fill_empty_rows(inputs, 0) + # We need to do some munging of our input to use the embedding lookup as + # a matrix multiply. We split our input matrix into separate ids and + # weights tensors. The values of the ids tensor should be the column + # indices of our input matrix and the values of the weights tensor + # can continue to the actual matrix weights. + # The column arrangement of ids and weights + # will be summed over and does not matter. See the documentation for + # sparse_ops.sparse_tensor_dense_matmul a more detailed explanation + # of the inputs to both ops. + ids = tf.SparseTensor(indices=inputs.indices, values=inputs.indices[:, 1], dense_shape=inputs.dense_shape) + weights = inputs + outputs = tf.nn.embedding_lookup_sparse(self.kernel, ids, weights, combiner='sum') + else: + outputs = tf.matmul(a=inputs, b=self.kernel) + # Broadcast kernel to inputs. 
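+    # For rank > 2 inputs, tf.tensordot below contracts the last input axis
+    # against axis 0 of the kernel.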
+ else: + outputs = tf.tensordot(inputs, self.kernel, [[rank - 1], [0]]) + # Reshape the output back to the original ndim of the input. + if not tf.executing_eagerly(): + shape = inputs.shape.as_list() + output_shape = shape[:-1] + [self.kernel.shape[-1]] + outputs.set_shape(output_shape) + + if self.use_bias: + outputs = tf.nn.bias_add(outputs, self.bias) + + if self.activation is not None: + outputs = self.activation(outputs) + + if is_ragged: + outputs = original_inputs.with_flat_values(outputs) + + return outputs + + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape) + input_shape = input_shape.with_rank_at_least(2) + if tf.compat.dimension_value(input_shape[-1]) is None: + raise ValueError( + 'The last dimension of the input shape of a Dense layer ' + 'should be defined. Found None. ' + f'Received: input_shape={input_shape}' + ) + return input_shape[:-1].concatenate(self.units) + + def get_config(self): + config = super(Dense, self).get_config() + config.update( + { + 'name': self.name, + 'units': self.units, + 'activation': activations.serialize(self.activation), + 'use_bias': self.use_bias, + 'kernel_initializer': initializers.serialize(self.kernel_initializer), + 'bias_initializer': initializers.serialize(self.bias_initializer), + 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), + 'bias_regularizer': regularizers.serialize(self.bias_regularizer), + 'activity_regularizer': regularizers.serialize(self.activity_regularizer), + 'kernel_constraint': constraints.serialize(self.kernel_constraint), + 'bias_constraint': constraints.serialize(self.bias_constraint) + } + ) + return config diff --git a/deepray/layers/dense_einsum.py b/deepray/layers/dense_einsum.py index 4fdbc24a..6abfbd37 100644 --- a/deepray/layers/dense_einsum.py +++ b/deepray/layers/dense_einsum.py @@ -24,7 +24,6 @@ _CHR_IDX = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m"] -# @tf.keras.utils.register_keras_serializable(package="Text") class DenseEinsum(tf.keras.layers.Layer): """A densely connected layer that uses tf.einsum as the backing computation. 
diff --git a/deepray/layers/dynamic_embedding.py b/deepray/layers/dynamic_embedding.py index d3048144..742c632c 100644 --- a/deepray/layers/dynamic_embedding.py +++ b/deepray/layers/dynamic_embedding.py @@ -1,132 +1,139 @@ # -*- coding:utf-8 -*- """Dynamic Embedding layer.""" - from collections import defaultdict from typing import Dict, List from typing import Optional, Literal import pandas as pd import tensorflow as tf -import tensorflow_recommenders_addons as tfra -from absl import flags, logging +from absl import flags from tensorflow.python.keras import regularizers, initializers -from tensorflow_recommenders_addons import dynamic_embedding as de -from tensorflow_recommenders_addons.dynamic_embedding.python.keras.layers import BasicEmbedding as DynamicEmbedding -from tensorflow_recommenders_addons.dynamic_embedding.python.keras.layers import HvdAllToAllEmbedding from deepray.layers.bucketize import NumericaBucketIdLayer, Hash -from deepray.utils.horovod_utils import get_world_size, get_rank +from deepray.utils import logging_util +from deepray.utils.horovod_utils import get_world_size, get_rank, is_main_process + +logger = logging_util.get_logger() + +try: + import tensorflow_recommenders_addons as tfra + from tensorflow_recommenders_addons import dynamic_embedding as de + from tensorflow_recommenders_addons.dynamic_embedding.python.keras.layers import BasicEmbedding as DynamicEmbedding + from tensorflow_recommenders_addons.dynamic_embedding.python.keras.layers import HvdAllToAllEmbedding + + class EmbeddingLayerRedis(DynamicEmbedding): + + def __init__(self, mini_batch_regularizer=None, mask_value=None, **kwargs): + self.mini_batch_regularizer = regularizers.get(mini_batch_regularizer) + self.mask_value = mask_value + super().__init__(**kwargs) + + def call(self, ids): + with tf.name_scope(self.name + "/EmbeddingLookupUnique"): + ids_flat = tf.reshape(ids, [-1]) + with tf.device("/CPU:0"): + unique_ids, idx = tf.unique(ids_flat) + unique_embeddings = tfra.dynamic_embedding.shadow_ops.embedding_lookup(self.shadow, unique_ids) + embeddings_flat = tf.gather(unique_embeddings, idx) + embeddings_shape = tf.concat([tf.shape(ids), tf.constant(self.embedding_size, shape=(1,))], 0) + embeddings = tf.reshape(embeddings_flat, embeddings_shape) + return embeddings + + def get_config(self): + config = { + 'mini_batch_regularizer': initializers.serialize(self.mini_batch_regularizer), + 'mask_value': self.mask_value + } + base_config = super(EmbeddingLayerRedis, self).get_config() + + return dict(list(base_config.items()) + list(config.items())) + + class EmbeddingLayerGPU(DynamicEmbedding): + + def __init__(self, mini_batch_regularizer=None, mask_value=None, **kwargs): + self.mini_batch_regularizer = regularizers.get(mini_batch_regularizer) + self.mask_value = mask_value + self.with_unique = kwargs.get("with_unique", True) + super().__init__(**kwargs) + + def call(self, ids): + with tf.name_scope(self.name + "/EmbeddingLookupUnique"): + if self.with_unique: + ids_flat = tf.reshape(ids, [-1]) + unique_ids, idx = tf.unique(ids_flat) + unique_embeddings = tfra.dynamic_embedding.shadow_ops.embedding_lookup(self.shadow, unique_ids) + embeddings_flat = tf.gather(unique_embeddings, idx) + embeddings_shape = tf.concat([tf.shape(ids), tf.constant(self.embedding_size, shape=(1,))], 0) + embeddings = tf.reshape(embeddings_flat, embeddings_shape) + else: + embeddings = tfra.dynamic_embedding.shadow_ops.embedding_lookup(self.shadow, ids) + return embeddings -FLAGS = flags.FLAGS + def get_config(self): + 
config = { + 'mini_batch_regularizer': initializers.serialize(self.mini_batch_regularizer), + 'mask_value': self.mask_value + } + base_config = super(EmbeddingLayerGPU, self).get_config() + return dict(list(base_config.items()) + list(config.items())) +except ImportError as e: + logger.warning("An exception occurred when import tensorflow_recommenders_addons: " + str(e)) -class DynamicEmbeddingOption(object): - def __init__( - self, - device: Optional[Literal["HBM", "DRAM", "Redis", "HKV"]] = None, - init_capacity=1 * 1024 * 1024, - max_capacity=128 * 1024 * 1024, - max_hbm_for_vectors=4 * 1024 * 1024 * 1024 - ): - self.device_name = device - self.init_capacity = init_capacity - self.max_capacity = max_capacity - self.max_hbm_for_vectors = max_hbm_for_vectors +class DistributedDynamicEmbedding(tf.keras.layers.Layer): - if device == "Redis": - if FLAGS.redis_config_env: - redis_config = tfra.dynamic_embedding.RedisTableConfig(redis_config_abs_dir_env=FLAGS.redis_config_env) + def get_de_options(self, case, init_capacity, **kwargs): + redis_creator = None + cuckoo_creator = None + hkv_creator = None + + if case == "Redis": + if flags.FLAGS.redis_config_env: + redis_config = tfra.dynamic_embedding.RedisTableConfig(redis_config_abs_dir_env=flags.FLAGS.redis_config_env) else: - redis_config = tfra.dynamic_embedding.RedisTableConfig(redis_config_abs_dir=FLAGS.redis_config_dir) + redis_config = tfra.dynamic_embedding.RedisTableConfig(redis_config_abs_dir=flags.FLAGS.redis_config_dir) + redis_creator = tfra.dynamic_embedding.RedisTableCreator(redis_config) - self.devices = ['/CPU:0'] - self.kv_creator = tfra.dynamic_embedding.RedisTableCreator(redis_config) - return - elif device == "HKV": - self.devices = ['/GPU:0'] + if case == "HKV": hkv_config = tfra.dynamic_embedding.HkvHashTableConfig( init_capacity=init_capacity, - max_capacity=max_capacity, - max_hbm_for_vectors=max_hbm_for_vectors, + max_capacity=kwargs.get("max_capacity", 128 * 1024 * 1024), + max_hbm_for_values=kwargs.get("max_hbm_for_values", 4 * 1024 * 1024 * 1024), ) - if FLAGS.use_horovod: - self.kv_creator = tfra.dynamic_embedding.HkvHashTableCreator( + if flags.FLAGS.use_horovod: + hkv_creator = tfra.dynamic_embedding.HkvHashTableCreator( hkv_config, saver=de.FileSystemSaver(proc_size=get_world_size(), proc_rank=get_rank()) ) else: - self.kv_creator = tfra.dynamic_embedding.HkvHashTableCreator(hkv_config) - return - elif device == "HBM": - self.devices = ['/GPU:0'] - elif device == "DRAM": - self.devices = ['/CPU:0'] - else: - raise ValueError(f"Found device {device} not in supported type Redis, DRAM, HBM, HKV") - if FLAGS.use_horovod: - self.kv_creator = de.CuckooHashTableCreator( + hkv_creator = tfra.dynamic_embedding.HkvHashTableCreator(hkv_config, saver=de.FileSystemSaver()) + + if flags.FLAGS.use_horovod: + cuckoo_creator = de.CuckooHashTableCreator( saver=de.FileSystemSaver(proc_size=get_world_size(), proc_rank=get_rank()) ) else: - self.kv_creator = de.CuckooHashTableCreator(saver=de.FileSystemSaver()) - - -class EmbeddingLayerRedis(DynamicEmbedding): - - def __init__(self, mini_batch_regularizer=None, mask_value=None, **kwargs): - self.mini_batch_regularizer = regularizers.get(mini_batch_regularizer) - self.mask_value = mask_value - super().__init__(**kwargs) - - def call(self, ids): - with tf.name_scope(self.name + "/EmbeddingLookupUnique"): - ids_flat = tf.reshape(ids, [-1]) - with tf.device("/CPU:0"): - unique_ids, idx = tf.unique(ids_flat) - unique_embeddings = 
tfra.dynamic_embedding.shadow_ops.embedding_lookup(self.shadow, unique_ids) - embeddings_flat = tf.gather(unique_embeddings, idx) - embeddings_shape = tf.concat([tf.shape(ids), tf.constant(self.embedding_size, shape=(1,))], 0) - embeddings = tf.reshape(embeddings_flat, embeddings_shape) - return embeddings - - def get_config(self): - config = { - 'mini_batch_regularizer': initializers.serialize(self.mini_batch_regularizer), - 'mask_value': self.mask_value - } - base_config = super(EmbeddingLayerRedis, self).get_config() - - return dict(list(base_config.items()) + list(config.items())) - - -class EmbeddingLayerGPU(DynamicEmbedding): - - def __init__(self, mini_batch_regularizer=None, mask_value=None, **kwargs): - self.mini_batch_regularizer = regularizers.get(mini_batch_regularizer) - self.mask_value = mask_value - super().__init__(**kwargs) - - def call(self, ids): - with tf.name_scope(self.name + "/EmbeddingLookupUnique"): - ids_flat = tf.reshape(ids, [-1]) - unique_ids, idx = tf.unique(ids_flat) - unique_embeddings = tfra.dynamic_embedding.shadow_ops.embedding_lookup(self.shadow, unique_ids) - embeddings_flat = tf.gather(unique_embeddings, idx) - embeddings_shape = tf.concat([tf.shape(ids), tf.constant(self.embedding_size, shape=(1,))], 0) - embeddings = tf.reshape(embeddings_flat, embeddings_shape) - return embeddings - - def get_config(self): - config = { - 'mini_batch_regularizer': initializers.serialize(self.mini_batch_regularizer), - 'mask_value': self.mask_value + cuckoo_creator = de.CuckooHashTableCreator(saver=de.FileSystemSaver()) + + switcher = { + "Redis": { + "devices": ['/CPU:0'], + "kv_creator": redis_creator, + }, + "DRAM": { + "devices": ['/CPU:0'], + "kv_creator": cuckoo_creator, + }, + "HBM": { + "devices": ['/GPU:0'], + "kv_creator": cuckoo_creator, + }, + "HKV": { + "devices": ['/GPU:0'], + "kv_creator": hkv_creator, + }, } - base_config = super(EmbeddingLayerGPU, self).get_config() - - return dict(list(base_config.items()) + list(config.items())) - - -class DistributedDynamicEmbedding(tf.keras.layers.Layer): + return switcher.get(case, None) def __init__( self, @@ -135,7 +142,8 @@ def __init__( value_dtype: str, initializer=None, name: str = '', - de_option: DynamicEmbeddingOption = DynamicEmbeddingOption(device="DRAM"), + device: Optional[Literal["HBM", "DRAM", "Redis", "HKV", "EV"]] = "DRAM", + init_capacity=1 * 1024 * 1024, **kwargs ): super(DistributedDynamicEmbedding, self).__init__() @@ -143,35 +151,40 @@ def __init__( self.key_dtype = key_dtype self.value_dtype = value_dtype self.initializer = initializer - self.de_option = de_option + self.device = device + self.init_capacity = init_capacity - if de_option.device_name == "Redis": + if device == "Redis": + de_option = self.get_de_options(device, init_capacity, **kwargs) self.emb = EmbeddingLayerRedis( embedding_size=embedding_dim, key_dtype=key_dtype, value_dtype=value_dtype, initializer=initializer, name=name, - devices=de_option.devices, - kv_creator=de_option.kv_creator, + devices=de_option["devices"], + kv_creator=de_option["kv_creator"], **kwargs ) - logging.info(f"Create EmbeddingLayer for {name} on {de_option.device_name} with {embedding_dim} dim") + if is_main_process(): + logger.info(f"Create EmbeddingLayer for {name} on {device} with {embedding_dim} dim") return - if not FLAGS.use_horovod: + de_option = self.get_de_options(device, init_capacity, **kwargs) + if not flags.FLAGS.use_horovod: self.emb = EmbeddingLayerGPU( embedding_size=embedding_dim, key_dtype=key_dtype, value_dtype=value_dtype, 
initializer=initializer, name=name, - devices=de_option.devices, - init_capacity=de_option.init_capacity, - kv_creator=de_option.kv_creator, + devices=de_option["devices"], + init_capacity=init_capacity, + kv_creator=de_option["kv_creator"], **kwargs ) - logging.info(f"Create EmbeddingLayer for {name} on {de_option.device_name} with {embedding_dim} dim") + if is_main_process(): + logger.info(f"Create EmbeddingLayer for {name} on {device} with {embedding_dim} dim") else: self.emb = HvdAllToAllEmbedding( embedding_size=embedding_dim, @@ -179,12 +192,13 @@ def __init__( value_dtype=value_dtype, initializer=initializer, name=name, - devices=de_option.devices, - init_capacity=de_option.init_capacity, - kv_creator=de_option.kv_creator, + devices=de_option["devices"], + init_capacity=init_capacity, + kv_creator=de_option["kv_creator"], **kwargs ) - logging.info(f"Create HvdAllToAllEmbedding for {name} on {de_option.device_name} with {embedding_dim} dim") + if is_main_process(): + logger.info(f"Create HvdAllToAllEmbedding for {name} on {device} with {embedding_dim} dim") def call(self, ids, *args, **kwargs): return self.emb(ids) @@ -197,8 +211,8 @@ def get_config(self): "key_dtype": self.key_dtype, "value_dtype": self.value_dtype, "initializer": self.initializer, - "name": self.name, - "de_option": self.de_option, + "device": self.device, + "init_capacity": self.init_capacity } ) return config @@ -279,13 +293,13 @@ def factor2decimal(self, composition_part: int): return res def build(self, input_shape=None): - self.composition_emb = DistributedDynamicEmbedding( + self.composition_emb = EmbeddingVariable( embedding_dim=self.embedding_dim, key_dtype=self.key_dtype, value_dtype=self.value_dtype, initializer=self.initializer, name=f"embeddings_{self.suffix}/Compositional", - de_option=DynamicEmbeddingOption(device=self.device,) + device=self.device, ) def call(self, inputs, *args, **kwargs): @@ -384,13 +398,13 @@ def build(self, input_shape): name=self.fold_columns[name] ) else: - self.embedding_layers[self.fold_columns[name]] = DistributedDynamicEmbedding( + self.embedding_layers[self.fold_columns[name]] = EmbeddingVariable( embedding_dim=dim, key_dtype=tf.int32 if self.is_valid_value(bucket_boundaries) else dtype, value_dtype=tf.float32, initializer=tf.keras.initializers.GlorotUniform(), name='embedding_' + self.fold_columns[name], - de_option=DynamicEmbeddingOption(device=storage_type) + device=storage_type, ) self.split_dims[self.fold_columns[name]].append(length) diff --git a/deepray/layers/embedding.py b/deepray/layers/embedding.py index 71b91886..557b8314 100644 --- a/deepray/layers/embedding.py +++ b/deepray/layers/embedding.py @@ -20,24 +20,40 @@ import numpy as np import pandas as pd import tensorflow as tf -from absl import flags -from keras import backend -from keras import constraints -from keras import initializers -from keras import regularizers -from keras.dtensor import utils -from keras.engine import base_layer_utils -from keras.engine.base_layer import Layer -from tensorflow import keras -from tensorflow.keras import layers -from tensorflow.keras.layers import StringLookup -from tensorflow.python.keras.utils import tf_utils - +from packaging.version import parse + +if parse(tf.__version__.replace("-tf", "+tf")) < parse("2.11"): + from keras import backend + from keras import constraints + from keras import initializers + from keras import regularizers + from keras.dtensor import utils + from keras.engine import base_layer_utils + from keras.engine.base_layer import Layer + from 
keras.utils import tf_utils +elif parse(tf.__version__) > parse("2.16.0"): + from tf_keras.src import backend + from tf_keras.src import constraints + from tf_keras.src import initializers + from tf_keras.src import regularizers + from tf_keras.src.dtensor import utils + from tf_keras.src.engine import base_layer_utils + from tf_keras.src.engine.base_layer import Layer + from tf_keras.src.utils import tf_utils +else: + from keras.src import backend + from keras.src import constraints + from keras.src import initializers + from keras.src import regularizers + from keras.src.dtensor import utils + from keras.src.engine import base_layer_utils + from keras.src.engine.base_layer import Layer + from keras.src.utils import tf_utils + +import tf_keras as keras import deepray as dp from deepray.layers.bucketize import Hash -FLAGS = flags.FLAGS - def get_variable_path(checkpoint_path, name, i=0): tokens = name.split('/') @@ -55,7 +71,7 @@ class Embedding(Layer): e.g. `[[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]` This layer can only be used on positive integer inputs of a fixed range. The - `tf.keras.layers.TextVectorization`, `tf.keras.layers.StringLookup`, + `tf.keras.layers.TextVectorization`, `keras.layers.StringLookup`, and `tf.keras.layers.IntegerLookup` preprocessing layers can help prepare inputs for an `Embedding` layer. @@ -576,12 +592,12 @@ def __init__(self, vocabulary, embedding_dim, num_buckets, name=None): super().__init__(name=name) self.num_buckets = num_buckets - self.index_lookup = StringLookup(vocabulary=vocabulary, mask_token=None, num_oov_indices=0) - self.q_embeddings = layers.Embedding( + self.index_lookup = keras.layers.StringLookup(vocabulary=vocabulary, mask_token=None, num_oov_indices=0) + self.q_embeddings = keras.layers.Embedding( num_buckets, embedding_dim, ) - self.r_embeddings = layers.Embedding( + self.r_embeddings = keras.layers.Embedding( num_buckets, embedding_dim, ) @@ -649,17 +665,17 @@ def __init__(self, blocks_vocabulary, blocks_embedding_dims, base_embedding_dim, block_embedding_encoder = self.embedding_encoder(vocabulary, embedding_dim, num_oov_indices=1) self.block_embedding_encoders.append(block_embedding_encoder) if embedding_dim == base_embedding_dim: - self.block_embedding_projectors.append(layers.Lambda(lambda x: x)) + self.block_embedding_projectors.append(keras.layers.Lambda(lambda x: x)) else: - self.block_embedding_projectors.append(layers.Dense(units=base_embedding_dim)) + self.block_embedding_projectors.append(keras.layers.Dense(units=base_embedding_dim)) self.base_embedding_dim = 64 def embedding_encoder(self, vocabulary, embedding_dim, num_oov_indices=0, name=None): return keras.Sequential( [ - StringLookup(vocabulary=vocabulary, mask_token=None, num_oov_indices=num_oov_indices), - layers.Embedding(input_dim=len(vocabulary) + num_oov_indices, output_dim=embedding_dim), + keras.layers.StringLookup(vocabulary=vocabulary, mask_token=None, num_oov_indices=num_oov_indices), + keras.layers.Embedding(input_dim=len(vocabulary) + num_oov_indices, output_dim=embedding_dim), ], name=f"{name}_embedding" if name else None, ) diff --git a/deepray/layers/embedding_variable.py b/deepray/layers/embedding_variable.py new file mode 100644 index 00000000..ae9aef32 --- /dev/null +++ b/deepray/layers/embedding_variable.py @@ -0,0 +1,206 @@ +# -*- coding:utf-8 -*- +"""Dynamic Embedding layer.""" +import typing + +import horovod.tensorflow as hvd +import tensorflow as tf +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from 
tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops + +from deepray.custom_ops.embedding_variable import config_pb2 +from deepray.custom_ops.embedding_variable import variables as ev_variables +from deepray.custom_ops.embedding_variable.variable_scope import get_embedding_variable +from deepray.utils import logging_util +from deepray.utils.horovod_utils import get_world_size + +logger = logging_util.get_logger() + +StorageType = { + "HBM": config_pb2.StorageType.HBM, + "DRAM": config_pb2.StorageType.DRAM, + "HBM_DRAM": config_pb2.StorageType.HBM_DRAM, + "LEVELDB": config_pb2.StorageType.LEVELDB, + "SSDHASH": config_pb2.StorageType.SSDHASH, + "DRAM_LEVELDB": config_pb2.StorageType.DRAM_LEVELDB, + "DRAM_SSDHASH": config_pb2.StorageType.DRAM_SSDHASH +} + +CacheStrategy = {"LFU": config_pb2.CacheStrategy.LFU, "LRU": config_pb2.CacheStrategy.LRU} + + +def default_partition_fn(keys, shard_num): + """The default partition function. + partition keys by "mod" strategy. + + keys: a tensor presents the keys to be partitioned. + shard_num: the num of partitions + Returns: + a tensor with same shape as keys with type of `tf.int32`, + represents the corresponding partition-ids of keys. + """ + return math_ops.mod(keys, shard_num) + + +def int64_partition_fn(keys, shard_num): + return math_ops.cast(math_ops.mod(keys, shard_num), dtype=dtypes.int32) + + +def partition_fn_v2(keys, shard_num): + return tf.cast( + tf.strings.to_hash_bucket_fast( + tf.strings.as_string(keys), # 将 int 转为 string 再哈希 + num_buckets=shard_num + ), + tf.int32 + ) + + +class EmbeddingVariable(tf.keras.layers.Layer): + + def __init__( + self, + embedding_dim: int, + key_dtype=dtypes.int64, + value_dtype: str = None, + initializer=None, + name: str = '', + with_unique=False, + partition_fn: typing.Callable[[typing.Any, typing.Any], typing.Any] = None, + **kwargs + ): + super(EmbeddingVariable, self).__init__(name=name) + self.embedding_size = embedding_dim + self.with_unique = with_unique + self.world_size = get_world_size() + + if partition_fn is None: + if key_dtype == dtypes.int64: + partition_fn = int64_partition_fn + elif key_dtype == dtypes.int32: + partition_fn = default_partition_fn + + storage_type = kwargs.get("storage_type", None) + if storage_type: + ev_option = ev_variables.EmbeddingVariableOption( + storage_option=ev_variables.StorageOption( + storage_type=StorageType[storage_type], + storage_path=kwargs.get("storage_path", None), + storage_size=kwargs.get("storage_size", [1024 * 1024 * 1024]), + cache_strategy=CacheStrategy[kwargs.get("cache_strategy", "LFU")] + ) + ) + else: + ev_option = ev_variables.EmbeddingVariableOption() + + self.embedding_variable = get_embedding_variable( + embedding_dim=embedding_dim, + key_dtype=key_dtype, + value_dtype=value_dtype, + initializer=initializer, + name=name, + ev_option=ev_option, + ) + + self.partition_fn = partition_fn + if self.world_size > 1: + self.call = self.hvd_read + if self.world_size >= 8: # 小规模并行时用取模更快 + self.partition_fn = partition_fn_v2 + else: + self.call = self.unique_read if self.with_unique else self.read + + def make_partition(self, data, partition_index): + """ + Shard keys to shard_num partitions + + Args: + data: keys or values, usually the IDs of dynamic features. + partition_index: partitions index. 
+ shard_num: partition number + Returns: + a pair of tensor: (partition result, partition indices) + """ + partitions = tf.dynamic_partition(data, partition_index, self.world_size) + indices = tf.dynamic_partition(math_ops.range(array_ops.shape(data)[0]), partition_index, self.world_size) + return partitions, indices + + def read(self, ids, *args, **kwargs): + return self.embedding_variable.sparse_read(ids) + + def unique_read(self, ids, *args, **kwargs): + """Read with deduplication for better performance with repeated IDs.""" + with ops.name_scope(f"{self.name}/EmbeddingWithUnique"): + ids_flat = tf.reshape(ids, [-1]) + unique_ids, idx = tf.unique(ids_flat) + unique_embeddings = self.embedding_variable.sparse_read(unique_ids) + embeddings_flat = tf.gather(unique_embeddings, idx) + embeddings_shape = tf.concat([tf.shape(ids), tf.constant(self.embedding_size, shape=(1,))], 0) + embeddings = tf.reshape(embeddings_flat, embeddings_shape) + return embeddings + + def hvd_read(self, ids, *args, **kwargs): + """ + Compute embedding output for feature ids. The output shape will be (shape(ids), + embedding_size). + + Args: + ids: feature ids of the input. It should be same dtype as the key_dtype + of the layer. + + Returns: + A embedding output with shape (shape(ids), embedding_size). + """ + is_ragged = isinstance(ids, tf.RaggedTensor) + + if is_ragged: + original_structure = ids + ids = ids.flat_values + + input_shape = tf.shape(ids) + embeddings_shape = tf.concat([input_shape, [self.embedding_size]], 0) + + ids_flat = tf.reshape(ids, [-1]) + + def distributed_lookup(ids): + partition_index = self.partition_fn(ids, self.world_size) + ids_partitions, gather_indices = self.make_partition(ids, partition_index) + partitions_sizes = tf.stack([tf.size(p) for p in ids_partitions], axis=0) + relocs_tensor = tf.concat(ids_partitions, axis=0) + # Provide a unique name for the first alltoall operation + flat_reloc_ids, remote_sizes = hvd.alltoall( + relocs_tensor, splits=partitions_sizes, name=f"{self.name}_alltoall_ids" + ) + + lookup_result = self.read(flat_reloc_ids) + lookup_result, _ = hvd.alltoall(lookup_result, splits=remote_sizes, name=f"{self.name}_alltoall_embeddings") + + input_shape = tf.shape(ids) + recover_shape = tf.concat((input_shape, (self.embedding_size,)), axis=0) + gather_indices = tf.expand_dims(tf.concat(gather_indices, axis=0), axis=-1) + lookup_result = tf.scatter_nd(gather_indices, lookup_result, recover_shape) + return lookup_result + + if self.with_unique: + # with ops.name_scope(name, "EmbeddingWithUnique"): + unique_ids, idx = tf.unique(ids_flat) + unique_embeddings = distributed_lookup(unique_ids) + embeddings_flat = tf.gather(unique_embeddings, idx) + else: + embeddings_flat = distributed_lookup(ids_flat) + + embeddings = tf.reshape(embeddings_flat, embeddings_shape) + + if is_ragged: + embeddings = tf.RaggedTensor.from_row_lengths(embeddings, original_structure.row_lengths()) + + return embeddings + + def get_config(self): + config = super().get_config() + config.update({ + "world_size": self.world_size, + "name": self.name, + }) + return config diff --git a/deepray/layers/feature_cross.py b/deepray/layers/feature_cross.py index a2b24694..40ad397c 100644 --- a/deepray/layers/feature_cross.py +++ b/deepray/layers/feature_cross.py @@ -351,8 +351,8 @@ def build(self, input_shape): kernel_regularizer=self.regularizer, name="compress_tower" ) - self._trainable_weights.extend(self.compress_tower.trainable_weights) - 
self._non_trainable_weights.extend(self.compress_tower.non_trainable_weights) + self.trainable_weights.extend(self.compress_tower.trainable_weights) + self.non_trainable_weights.extend(self.compress_tower.non_trainable_weights) return super(CDot, self).build(input_shape) def call(self, inputs, **kwargs): @@ -516,7 +516,6 @@ def __init__( allow_kernel_norm: bool = False, use_dropout=False, keep_prob=0.95, - mode: str = tf.estimator.ModeKeys.TRAIN, **kwargs ): super(DCN, self).__init__(**kwargs) @@ -529,7 +528,6 @@ def __init__( self.allow_kernel_norm = allow_kernel_norm self.use_dropout = use_dropout self.keep_prob = keep_prob - self.mode = mode def build(self, input_shape): dims = check_dim(input_shape[-1]) @@ -621,7 +619,7 @@ def build(self, input_shape): return super(DCN, self).build(input_shape) - def call(self, inputs, **kwargs): + def call(self, inputs, training=None, **kwargs): x0 = inputs xl = x0 @@ -660,7 +658,7 @@ def call(self, inputs, **kwargs): moe_out = tf.matmul(output_of_experts, gating_score_of_experts) xl = tf.squeeze(moe_out, -1) + xl - if self.use_dropout and self.mode == tf.estimator.ModeKeys.TRAIN: + if self.use_dropout and training: xl = tf.nn.dropout(xl, rate=1 - self.keep_prob) return xl @@ -683,15 +681,15 @@ def get_variable(self, name, shape, dtype, initializer, regularizer, trainable): for v in var: K.track_variable(v) if trainable: - self._trainable_weights.append(v) + self.trainable_weights.append(v) else: - self._non_trainable_weights.append(v) + self.non_trainable_weights.append(v) else: K.track_variable(var) if trainable: - self._trainable_weights.append(var) + self.trainable_weights.append(var) else: - self._non_trainable_weights.append(var) + self.non_trainable_weights.append(var) with tf.compat.v1.variable_scope('', reuse=tf.compat.v1.AUTO_REUSE): trainable_var_norm = tf.compat.v1.get_variable( @@ -703,15 +701,15 @@ def get_variable(self, name, shape, dtype, initializer, regularizer, trainable): for v in trainable_var_norm: K.track_variable(v) if trainable: - self._trainable_weights.append(v) + self.trainable_weights.append(v) else: - self._non_trainable_weights.append(v) + self.non_trainable_weights.append(v) else: K.track_variable(trainable_var_norm) if trainable: - self._trainable_weights.append(trainable_var_norm) + self.trainable_weights.append(trainable_var_norm) else: - self._non_trainable_weights.append(trainable_var_norm) + self.non_trainable_weights.append(trainable_var_norm) var = tf.multiply(normalized, trainable_var_norm, name='mul_var_norm') else: var = self.add_weight( @@ -731,7 +729,6 @@ def get_config(self): 'allow_kernel_norm': self.allow_kernel_norm, 'use_dropout': self.use_dropout, 'keep_prob': self.keep_prob, - 'mode': self.mode } base_config = super(DCN, self).get_config() @@ -819,8 +816,8 @@ def build(self, input_shape): ) ) - self._trainable_weights.extend(self._conv1d[-1].trainable_weights) - self._non_trainable_weights.extend(self._conv1d[-1].non_trainable_weights) + self.trainable_weights.extend(self._conv1d[-1].trainable_weights) + self.non_trainable_weights.extend(self._conv1d[-1].non_trainable_weights) return super(CIN, self).build(input_shape) def call(self, inputs, **kwargs): diff --git a/deepray/layers/masked_softmax.py b/deepray/layers/masked_softmax.py index 257b2ae9..1925955b 100644 --- a/deepray/layers/masked_softmax.py +++ b/deepray/layers/masked_softmax.py @@ -22,7 +22,6 @@ import tensorflow as tf -@tf.keras.utils.register_keras_serializable(package='Text') class MaskedSoftmax(tf.keras.layers.Layer): """Performs a 
softmax with optional masking on a tensor. diff --git a/deepray/layers/max_unpooling_2d.py b/deepray/layers/max_unpooling_2d.py index 01bda9c1..4e7cbc28 100644 --- a/deepray/layers/max_unpooling_2d.py +++ b/deepray/layers/max_unpooling_2d.py @@ -19,7 +19,46 @@ from typeguard import typechecked from typing import Union, Iterable -from deepray.utils.keras_utils import normalize_tuple + +def normalize_tuple(value, n, name): + """Transforms an integer or iterable of integers into an integer tuple. + + A copy of tensorflow.python.keras.util. + + Args: + value: The value to validate and convert. Could an int, or any iterable + of ints. + n: The size of the tuple to be returned. + name: The name of the argument being validated, e.g. "strides" or + "kernel_size". This is only used to format error messages. + + Returns: + A tuple of n integers. + + Raises: + ValueError: If something else than an int/long or iterable thereof was + passed. + """ + if isinstance(value, int): + return (value,) * n + else: + try: + value_tuple = tuple(value) + except TypeError: + raise TypeError("The `" + name + "` argument must be a tuple of " + str(n) + " integers. Received: " + str(value)) + if len(value_tuple) != n: + raise ValueError( + "The `" + name + "` argument must be a tuple of " + str(n) + " integers. Received: " + str(value) + ) + for single_value in value_tuple: + try: + int(single_value) + except (ValueError, TypeError): + raise ValueError( + "The `" + name + "` argument must be a tuple of " + str(n) + " integers. Received: " + str(value) + " " + "including element " + str(single_value) + " of type" + " " + str(type(single_value)) + ) + return value_tuple def _calculate_output_shape(input_shape, pool_size, strides, padding): diff --git a/deepray/layers/max_unpooling_2d_v2.py b/deepray/layers/max_unpooling_2d_v2.py index 1dd4607e..6acd769f 100644 --- a/deepray/layers/max_unpooling_2d_v2.py +++ b/deepray/layers/max_unpooling_2d_v2.py @@ -19,7 +19,7 @@ from typeguard import typechecked from typing import Iterable -from deepray.utils.keras_utils import normalize_tuple +from deepray.layers.max_unpooling_2d import normalize_tuple def _max_unpooling_2d_v2(updates, mask, output_size): diff --git a/deepray/layers/mlp.py b/deepray/layers/mlp.py index bf1d59ea..b6caa864 100644 --- a/deepray/layers/mlp.py +++ b/deepray/layers/mlp.py @@ -2,8 +2,7 @@ from typing import List import tensorflow as tf -from tensorflow.keras.layers import BatchNormalization as BatchNorm -from tensorflow.python.keras import regularizers +import tf_keras as keras def extend_as_list(x, n): @@ -66,8 +65,8 @@ def __init__( self.hidden_units = hidden_units self.prefix = name self.use_bias = use_bias - self.kernel_regularizer = regularizers.get(kernel_regularizer) - self.bias_regularizer = regularizers.get(bias_regularizer) + self.kernel_regularizer = keras.regularizers.get(kernel_regularizer) + self.bias_regularizer = keras.regularizers.get(bias_regularizer) self.enable_batch_normalization = enable_batch_normalization self.batch_normalization_momentum = batch_normalization_momentum self.batch_normalization_renorm = batch_normalization_renorm @@ -95,15 +94,15 @@ def __init__( def build(self, input_shape): if self.enable_batch_normalization: - bn = BatchNorm( + bn = keras.layers.BatchNormalization( momentum=self.batch_normalization_momentum, renorm=self.batch_normalization_renorm, renorm_clipping=self.batch_normalization_renorm_clipping, renorm_momentum=self.batch_normalization_renorm_momentum, name=f"BatchNorm/in" ) - 
self._trainable_weights.extend(bn.trainable_weights) - self._non_trainable_weights.extend(bn.non_trainable_weights) + self.trainable_weights.extend(bn.trainable_weights) + self.non_trainable_weights.extend(bn.non_trainable_weights) self.add_loss(bn.losses) self._stacked_layers.append(bn) @@ -119,21 +118,21 @@ def build(self, input_shape): kernel_regularizer=self.kernel_regularizer, bias_regularizer=self.bias_regularizer ) - self._trainable_weights.extend(dense.trainable_weights) - self._non_trainable_weights.extend(dense.non_trainable_weights) + self.trainable_weights.extend(dense.trainable_weights) + self.non_trainable_weights.extend(dense.non_trainable_weights) self.add_loss(dense.losses) self._stacked_layers.append(dense) if not is_final_layer and self.enable_batch_normalization: - bn = BatchNorm( + bn = keras.layers.BatchNormalization( momentum=self.batch_normalization_momentum, renorm=self.batch_normalization_renorm, renorm_clipping=self.batch_normalization_renorm_clipping, renorm_momentum=self.batch_normalization_renorm_momentum, name=f"BatchNorm/out" ) - self._trainable_weights.extend(bn.trainable_weights) - self._non_trainable_weights.extend(bn.non_trainable_weights) + self.trainable_weights.extend(bn.trainable_weights) + self.non_trainable_weights.extend(bn.non_trainable_weights) self.add_loss(bn.losses) self._stacked_layers.append(bn) @@ -158,8 +157,8 @@ def get_config(self): "enable_batch_normalization": self.enable_batch_normalization, "batch_normalization_momentum": self.batch_normalization_momentum, "use_bias": self.use_bias, - 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), - 'bias_regularizer': regularizers.serialize(self.bias_regularizer), + 'kernel_regularizer': keras.regularizers.serialize(self.kernel_regularizer), + 'bias_regularizer': keras.regularizers.serialize(self.bias_regularizer), 'batch_normalization_renorm': self.batch_normalization_renorm, 'batch_normalization_renorm_clipping': self.batch_normalization_renorm_clipping, 'batch_normalization_renorm_momentum': self.batch_normalization_renorm_momentum diff --git a/deepray/layers/networks/__init__.py b/deepray/layers/networks/__init__.py index 1b10b038..e69de29b 100644 --- a/deepray/layers/networks/__init__.py +++ b/deepray/layers/networks/__init__.py @@ -1,17 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Networks package definition.""" -from deepray.layers.networks.transformer_encoder import TransformerEncoder -from .span_labeling import SpanLabeling \ No newline at end of file diff --git a/deepray/layers/noisy_dense.py b/deepray/layers/noisy_dense.py index 07fa9d39..83caf31e 100644 --- a/deepray/layers/noisy_dense.py +++ b/deepray/layers/noisy_dense.py @@ -14,14 +14,7 @@ # ============================================================================== import tensorflow as tf -from tensorflow.keras import ( - activations, - initializers, - regularizers, - constraints, -) -from tensorflow.keras import backend as K -from tensorflow.keras.layers import InputSpec +import tf_keras as keras from typeguard import typechecked from deepray.utils import types @@ -137,7 +130,7 @@ def __init__( def build(self, input_shape): # Make sure dtype is correct - dtype = tf.dtypes.as_dtype(self.dtype or K.floatx()) + dtype = tf.dtypes.as_dtype(self.dtype or keras.floatx()) if not (dtype.is_floating or dtype.is_complex): raise TypeError("Unable to build `Dense` layer with non-floating point " "dtype %s" % (dtype,)) @@ -148,7 +141,7 @@ def build(self, input_shape): if self.last_dim is None: raise ValueError("The last dimension of the inputs to `Dense` " "should be defined. Found `None`.") - self.input_spec = InputSpec(min_ndim=2, axes={-1: self.last_dim}) + self.input_spec = keras.layers.InputSpec(min_ndim=2, axes={-1: self.last_dim}) # use factorising Gaussian variables if self.use_factorised: @@ -159,8 +152,8 @@ def build(self, input_shape): mu_init = (3.0 / self.last_dim)**(1 / 2) sigma_init = 0.017 - sigma_init = initializers.Constant(value=sigma_init) - mu_init = initializers.RandomUniform(minval=-mu_init, maxval=mu_init) + sigma_init = keras.initializers.Constant(value=sigma_init) + mu_init = keras.initializers.RandomUniform(minval=-mu_init, maxval=mu_init) # Learnable parameters self.sigma_kernel = self.add_weight( @@ -186,7 +179,7 @@ def build(self, input_shape): self.eps_kernel = self.add_weight( "eps_kernel", shape=[self.last_dim, self.units], - initializer=initializers.Zeros(), + initializer=keras.initializers.Zeros(), regularizer=None, constraint=None, dtype=self.dtype, @@ -223,7 +216,7 @@ def build(self, input_shape): shape=[ self.units, ], - initializer=initializers.Zeros(), + initializer=keras.initializers.Zeros(), regularizer=None, constraint=None, dtype=self.dtype, @@ -284,13 +277,13 @@ def get_config(self): "units": self.units, "sigma": self.sigma, "use_factorised": self.use_factorised, - "activation": activations.serialize(self.activation), + "activation": keras.activations.serialize(self.activation), "use_bias": self.use_bias, - "kernel_regularizer": regularizers.serialize(self.kernel_regularizer), - "bias_regularizer": regularizers.serialize(self.bias_regularizer), - "activity_regularizer": regularizers.serialize(self.activity_regularizer), - "kernel_constraint": constraints.serialize(self.kernel_constraint), - "bias_constraint": constraints.serialize(self.bias_constraint), + "kernel_regularizer": keras.regularizers.serialize(self.kernel_regularizer), + "bias_regularizer": keras.regularizers.serialize(self.bias_regularizer), + "activity_regularizer": keras.regularizers.serialize(self.activity_regularizer), + "kernel_constraint": keras.constraints.serialize(self.kernel_constraint), + "bias_constraint": keras.constraints.serialize(self.bias_constraint), } ) return config diff --git 
a/deepray/layers/on_device_embedding.py b/deepray/layers/on_device_embedding.py index 585c9fca..7cf4c4a8 100644 --- a/deepray/layers/on_device_embedding.py +++ b/deepray/layers/on_device_embedding.py @@ -24,7 +24,6 @@ from deepray.layers import tf_utils -# @tf.keras.utils.register_keras_serializable(package="Text") class OnDeviceEmbedding(tf.keras.layers.Layer): """Performs an embedding lookup suitable for accelerator devices. diff --git a/deepray/layers/pooling.py b/deepray/layers/pooling.py index 40c2c311..a3780066 100644 --- a/deepray/layers/pooling.py +++ b/deepray/layers/pooling.py @@ -1,9 +1,7 @@ import tensorflow as tf -from keras.engine.base_layer import Layer - -class Pooling(Layer): +class Pooling(tf.keras.layers.Layer): """ input shape: (batch_size, seq_len, emb_dim) output shape: (batch_size, 1, emb_dim) diff --git a/deepray/layers/rnn/esn_cell.py b/deepray/layers/rnn/esn_cell.py index 28db723d..6440c94c 100644 --- a/deepray/layers/rnn/esn_cell.py +++ b/deepray/layers/rnn/esn_cell.py @@ -15,7 +15,11 @@ """Implements ESN Cell.""" import tensorflow as tf -import tensorflow.keras as keras +from packaging.version import parse +if parse(tf.__version__) > parse("2.16.0"): + from tf_keras.src.layers.rnn.abstract_rnn_cell import AbstractRNNCell +else: + from tensorflow.keras.layers.AbstractRNNCell import AbstractRNNCell from typeguard import typechecked from deepray.utils.types import ( @@ -25,7 +29,7 @@ @tf.keras.utils.register_keras_serializable(package="Deepray") -class ESNCell(keras.layers.AbstractRNNCell): +class ESNCell(AbstractRNNCell): """Echo State recurrent Network (ESN) cell. This implements the recurrent cell from the paper: H. Jaeger diff --git a/deepray/layers/rnn/layer_norm_lstm_cell.py b/deepray/layers/rnn/layer_norm_lstm_cell.py index 589e889c..1600fecd 100644 --- a/deepray/layers/rnn/layer_norm_lstm_cell.py +++ b/deepray/layers/rnn/layer_norm_lstm_cell.py @@ -15,7 +15,7 @@ """Implements LayerNormLSTM Cell.""" import tensorflow as tf -import tensorflow.keras as keras +import tf_keras as keras from typeguard import typechecked from deepray.utils.types import ( diff --git a/deepray/layers/rnn/layer_norm_simple_rnn_cell.py b/deepray/layers/rnn/layer_norm_simple_rnn_cell.py index 7d4f0999..537a1107 100644 --- a/deepray/layers/rnn/layer_norm_simple_rnn_cell.py +++ b/deepray/layers/rnn/layer_norm_simple_rnn_cell.py @@ -15,7 +15,7 @@ """Implements LayerNormSimpleRNNCell Cell.""" import tensorflow as tf -import tensorflow.keras as keras +import tf_keras as keras from typeguard import typechecked from deepray.utils.types import ( diff --git a/deepray/layers/rnn/nas_cell.py b/deepray/layers/rnn/nas_cell.py index 7bf0c6f6..62e04f7e 100644 --- a/deepray/layers/rnn/nas_cell.py +++ b/deepray/layers/rnn/nas_cell.py @@ -15,7 +15,11 @@ """Implements NAS Cell.""" import tensorflow as tf -import tensorflow.keras as keras +from packaging.version import parse +if parse(tf.__version__) > parse("2.16.0"): + from tf_keras.src.layers.rnn.abstract_rnn_cell import AbstractRNNCell +else: + from tensorflow.keras.layers.AbstractRNNCell import AbstractRNNCell from typeguard import typechecked from deepray.utils.types import ( @@ -27,7 +31,7 @@ @tf.keras.utils.register_keras_serializable(package="Deepray") -class NASCell(keras.layers.AbstractRNNCell): +class NASCell(AbstractRNNCell): """Neural Architecture Search (NAS) recurrent network cell. 
This implements the recurrent cell from the paper: diff --git a/deepray/layers/rnn/tests/esn_cell_test.py b/deepray/layers/rnn/tests/esn_cell_test.py index 3f5840e6..f6924270 100644 --- a/deepray/layers/rnn/tests/esn_cell_test.py +++ b/deepray/layers/rnn/tests/esn_cell_test.py @@ -16,7 +16,7 @@ import numpy as np import tensorflow as tf -import tensorflow.keras as keras +import tf_keras as keras from deepray.layers.rnn import ESNCell diff --git a/deepray/layers/rnn/tests/layer_norm_lstm_cell_test.py b/deepray/layers/rnn/tests/layer_norm_lstm_cell_test.py index 43937ec7..49f26bd9 100644 --- a/deepray/layers/rnn/tests/layer_norm_lstm_cell_test.py +++ b/deepray/layers/rnn/tests/layer_norm_lstm_cell_test.py @@ -19,7 +19,7 @@ import numpy as np import tensorflow as tf -import tensorflow.keras as keras +import tf_keras as keras from deepray.layers.rnn import LayerNormLSTMCell diff --git a/deepray/layers/rnn/tests/layer_norm_simple_rnn_cell_test.py b/deepray/layers/rnn/tests/layer_norm_simple_rnn_cell_test.py index b2a5045b..baaefefc 100644 --- a/deepray/layers/rnn/tests/layer_norm_simple_rnn_cell_test.py +++ b/deepray/layers/rnn/tests/layer_norm_simple_rnn_cell_test.py @@ -15,7 +15,7 @@ """Tests for LayerNormSimpleRNN Cell.""" import numpy as np -import tensorflow.keras as keras +import tf_keras as keras from deepray.layers.rnn import LayerNormSimpleRNNCell diff --git a/deepray/layers/rnn/tests/nas_cell_test.py b/deepray/layers/rnn/tests/nas_cell_test.py index 24dc7465..63079d0c 100644 --- a/deepray/layers/rnn/tests/nas_cell_test.py +++ b/deepray/layers/rnn/tests/nas_cell_test.py @@ -16,7 +16,7 @@ import numpy as np import tensorflow as tf -import tensorflow.keras as keras +import tf_keras as keras from deepray.layers.rnn import NASCell diff --git a/deepray/layers/self_attention_mask.py b/deepray/layers/self_attention_mask.py index ba0e5a92..fe0940f2 100644 --- a/deepray/layers/self_attention_mask.py +++ b/deepray/layers/self_attention_mask.py @@ -23,7 +23,6 @@ from deepray.layers import tf_utils -@tf.keras.utils.register_keras_serializable(package='Text') class SelfAttentionMask(tf.keras.layers.Layer): """Create 3D attention mask from a 2D tensor mask. diff --git a/deepray/layers/tests_bak/on_device_embedding_test.py b/deepray/layers/tests_bak/on_device_embedding_test.py new file mode 100644 index 00000000..7ab347a4 --- /dev/null +++ b/deepray/layers/tests_bak/on_device_embedding_test.py @@ -0,0 +1,183 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for Keras-based one-hot embedding layer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf + +from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import +from deepray.layers import on_device_embedding + + +# This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It +# guarantees forward compatibility of this code for the V2 switchover. +@keras_parameterized.run_all_keras_modes +class OnDeviceEmbeddingTest(keras_parameterized.TestCase): + + def test_layer_creation(self): + vocab_size = 31 + embedding_width = 27 + test_layer = on_device_embedding.OnDeviceEmbedding(vocab_size=vocab_size, embedding_width=embedding_width) + # Create a 2-dimensional input (the first dimension is implicit). + sequence_length = 23 + input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32) + output_tensor = test_layer(input_tensor) + + # The output should be the same as the input, save that it has an extra + # embedding_width dimension on the end. + expected_output_shape = [None, sequence_length, embedding_width] + self.assertEqual(expected_output_shape, output_tensor.shape.as_list()) + self.assertEqual(output_tensor.dtype, tf.float32) + + def test_layer_creation_with_float16_dtype(self): + vocab_size = 31 + embedding_width = 27 + test_layer = on_device_embedding.OnDeviceEmbedding( + vocab_size=vocab_size, embedding_width=embedding_width, dtype="float16" + ) + # Create a 2-dimensional input (the first dimension is implicit). + sequence_length = 23 + input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32) + output_tensor = test_layer(input_tensor) + + # The output should be the same as the input, save that it has an extra + # embedding_width dimension on the end. + expected_output_shape = [None, sequence_length, embedding_width] + self.assertEqual(expected_output_shape, output_tensor.shape.as_list()) + self.assertEqual(output_tensor.dtype, tf.float16) + + def test_layer_invocation(self): + vocab_size = 31 + embedding_width = 27 + test_layer = on_device_embedding.OnDeviceEmbedding(vocab_size=vocab_size, embedding_width=embedding_width) + # Create a 2-dimensional input (the first dimension is implicit). + sequence_length = 23 + input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32) + output_tensor = test_layer(input_tensor) + + # Create a model from the test layer. + model = tf.keras.Model(input_tensor, output_tensor) + + # Invoke the model on test data. We can't validate the output data itself + # (the NN is too complex) but this will rule out structural runtime errors. + batch_size = 3 + input_data = np.random.randint(vocab_size, size=(batch_size, sequence_length)) + output = model.predict(input_data) + self.assertEqual(tf.float32, output.dtype) + + def test_layer_invocation_with_float16_dtype(self): + vocab_size = 31 + embedding_width = 27 + test_layer = on_device_embedding.OnDeviceEmbedding( + vocab_size=vocab_size, embedding_width=embedding_width, dtype="float16" + ) + # Create a 2-dimensional input (the first dimension is implicit). + sequence_length = 23 + input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32) + output_tensor = test_layer(input_tensor) + + # Create a model from the test layer. + model = tf.keras.Model(input_tensor, output_tensor) + + # Invoke the model on test data. 
We can't validate the output data itself + # (the NN is too complex) but this will rule out structural runtime errors. + batch_size = 3 + input_data = np.random.randint(vocab_size, size=(batch_size, sequence_length)) + output = model.predict(input_data) + self.assertEqual(tf.float16, output.dtype) + + def test_one_hot_layer_creation(self): + vocab_size = 31 + embedding_width = 27 + test_layer = on_device_embedding.OnDeviceEmbedding( + vocab_size=vocab_size, embedding_width=embedding_width, use_one_hot=True + ) + # Create a 2-dimensional input (the first dimension is implicit). + sequence_length = 23 + input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32) + output_tensor = test_layer(input_tensor) + + # The output should be the same as the input, save that it has an extra + # embedding_width dimension on the end. + expected_output_shape = [None, sequence_length, embedding_width] + self.assertEqual(expected_output_shape, output_tensor.shape.as_list()) + self.assertEqual(output_tensor.dtype, tf.float32) + + def test_one_hot_layer_creation_with_float16_dtype(self): + vocab_size = 31 + embedding_width = 27 + test_layer = on_device_embedding.OnDeviceEmbedding( + vocab_size=vocab_size, embedding_width=embedding_width, dtype="float16", use_one_hot=True + ) + # Create a 2-dimensional input (the first dimension is implicit). + sequence_length = 23 + input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32) + output_tensor = test_layer(input_tensor) + + # The output should be the same as the input, save that it has an extra + # embedding_width dimension on the end. + expected_output_shape = [None, sequence_length, embedding_width] + self.assertEqual(expected_output_shape, output_tensor.shape.as_list()) + self.assertEqual(output_tensor.dtype, tf.float16) + + def test_one_hot_layer_invocation(self): + vocab_size = 31 + embedding_width = 27 + test_layer = on_device_embedding.OnDeviceEmbedding( + vocab_size=vocab_size, embedding_width=embedding_width, use_one_hot=True + ) + # Create a 2-dimensional input (the first dimension is implicit). + sequence_length = 23 + input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32) + output_tensor = test_layer(input_tensor) + + # Create a model from the test layer. + model = tf.keras.Model(input_tensor, output_tensor) + + # Invoke the model on test data. We can't validate the output data itself + # (the NN is too complex) but this will rule out structural runtime errors. + batch_size = 3 + input_data = np.random.randint(vocab_size, size=(batch_size, sequence_length)) + output = model.predict(input_data) + self.assertEqual(tf.float32, output.dtype) + + def test_one_hot_layer_invocation_with_float16_dtype(self): + vocab_size = 31 + embedding_width = 27 + test_layer = on_device_embedding.OnDeviceEmbedding( + vocab_size=vocab_size, embedding_width=embedding_width, dtype="float16", use_one_hot=True + ) + # Create a 2-dimensional input (the first dimension is implicit). + sequence_length = 23 + input_tensor = tf.keras.Input(shape=(sequence_length), dtype=tf.int32) + output_tensor = test_layer(input_tensor) + + # Create a model from the test layer. + model = tf.keras.Model(input_tensor, output_tensor) + + # Invoke the model on test data. We can't validate the output data itself + # (the NN is too complex) but this will rule out structural runtime errors. 
+ batch_size = 3 + input_data = np.random.randint(vocab_size, size=(batch_size, sequence_length)) + output = model.predict(input_data) + self.assertEqual(tf.float16, output.dtype) + + +if __name__ == "__main__": + tf.test.main() diff --git a/deepray/layers/tf_utils.py b/deepray/layers/tf_utils.py index 2c5370f9..42301f6b 100644 --- a/deepray/layers/tf_utils.py +++ b/deepray/layers/tf_utils.py @@ -22,6 +22,7 @@ import tensorflow as tf from deepray import activations +from deepray.activations import swish def pack_inputs(inputs): @@ -92,9 +93,9 @@ def get_activation(identifier): if isinstance(identifier, six.string_types): name_to_fn = { "gelu": tf.keras.activations.gelu, - "simple_swish": activations.simple_swish, - "hard_swish": activations.hard_swish, - "identity": activations.identity, + "simple_swish": swish.simple_swish, + "hard_swish": swish.hard_swish, + "identity": swish.identity, } identifier = str(identifier).lower() if identifier in name_to_fn: diff --git a/deepray/layers/transformer.py b/deepray/layers/transformer.py index 08d1e28f..b00085a6 100644 --- a/deepray/layers/transformer.py +++ b/deepray/layers/transformer.py @@ -25,7 +25,6 @@ from deepray.layers import dense_einsum -# @tf.keras.utils.register_keras_serializable(package="Text") class Transformer(tf.keras.layers.Layer): """Transformer layer. @@ -129,7 +128,9 @@ def build(self, input_shape): ) self._attention_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate) self._attention_layer_norm = ( - tf.keras.layers.LayerNormalization(name="self_attention_layer_norm", axis=-1, epsilon=1e-12, dtype=tf.float32) + tf.keras.layers.LayerNormalization( + name=f"{self.name}/self_attention_layer_norm", axis=-1, epsilon=1e-12, dtype=tf.float32 + ) ) self._intermediate_dense = dense_einsum.DenseEinsum( output_shape=self._intermediate_size, @@ -157,7 +158,7 @@ def build(self, input_shape): ) self._output_dropout = tf.keras.layers.Dropout(rate=self._dropout_rate) self._output_layer_norm = tf.keras.layers.LayerNormalization( - name="output_layer_norm", axis=-1, epsilon=1e-12, dtype=tf.float32 + name=f"{self.name}/output_layer_norm", axis=-1, epsilon=1e-12, dtype=tf.float32 ) super(Transformer, self).build(input_shape) @@ -196,12 +197,12 @@ def call(self, inputs): attention_output = self._attention_dropout(attention_output) # Use float32 in keras layer norm and the gelu activation in the # intermediate dense layer for numeric stability - if self.dtype == tf.float16: + if self.dtype == tf.float16 or self.dtype == tf.bfloat16: input_tensor = tf.cast(input_tensor, tf.float32) attention_output = tf.cast(attention_output, tf.float32) attention_output = self._attention_layer_norm(input_tensor + attention_output) intermediate_output = self._intermediate_dense(attention_output) - if self.dtype == tf.float16: + if self.dtype == tf.float16 or self.dtype == tf.bfloat16: # Casts to float32 so that activation is done in float32. 
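# Editorial note on the mixed-precision path (descriptive, not part of the patch):
# when the layer's compute dtype is float16 or bfloat16, the residual adds, the
# layer norms, and the gelu activation in the intermediate dense layer are
# evaluated in float32 for numeric stability, and the block output is cast back
# to half precision at the very end. Note that the final cast below targets
# tf.float16 unconditionally, so a bfloat16 layer would emit float16 activations
# unless that cast is adjusted to self.dtype.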
intermediate_output = tf.cast(intermediate_output, tf.float32) intermediate_output = self._intermediate_activation_layer(intermediate_output) @@ -211,10 +212,10 @@ def call(self, inputs): layer_output = self._output_dense(intermediate_output) layer_output = self._output_dropout(layer_output) # Use float32 in keras layer norm for numeric stability - if self.dtype == tf.float16: + if self.dtype == tf.float16 or self.dtype == tf.bfloat16: layer_output = tf.cast(layer_output, tf.float32) layer_output = self._output_layer_norm(layer_output + attention_output) - if self.dtype == tf.float16: + if self.dtype == tf.float16 or self.dtype == tf.bfloat16: layer_output = tf.cast(layer_output, tf.float16) return layer_output diff --git a/deepray/layers/transformer_scaffold.py b/deepray/layers/transformer_scaffold.py index e8d865ed..488f8a59 100644 --- a/deepray/layers/transformer_scaffold.py +++ b/deepray/layers/transformer_scaffold.py @@ -25,7 +25,6 @@ from deepray.layers import dense_einsum -# @tf.keras.utils.register_keras_serializable(package="Text") class TransformerScaffold(tf.keras.layers.Layer): """Transformer scaffold layer. diff --git a/deepray/losses/__init__.py b/deepray/losses/__init__.py index f77298c1..295e6c4b 100644 --- a/deepray/losses/__init__.py +++ b/deepray/losses/__init__.py @@ -15,16 +15,20 @@ """Additional losses that conform to Keras API.""" import abc -import tensorflow as tf from absl import flags -from keras.engine import compile_utils -from tensorflow.keras.losses import BinaryCrossentropy +import tensorflow as tf +from packaging.version import parse + +if parse(tf.__version__) < parse("2.11"): + from keras.engine import compile_utils +elif parse(tf.__version__) > parse("2.16.0"): + from tf_keras.src.engine import compile_utils + import tf_keras as keras +else: + from keras.src.engine import compile_utils +from tensorflow.keras.losses import BinaryCrossentropy from deepray.losses.contrastive import contrastive_loss, ContrastiveLoss -from deepray.losses.focal_loss import ( - sigmoid_focal_crossentropy, - SigmoidFocalCrossEntropy, -) from deepray.losses.giou_loss import giou_loss, GIoULoss from deepray.losses.kappa_loss import WeightedKappaLoss from deepray.losses.lifted import lifted_struct_loss, LiftedStructLoss @@ -42,8 +46,7 @@ TripletSemiHardLoss, TripletHardLoss, ) - -FLAGS = flags.FLAGS +from deepray.losses.softmax_loss import SoftmaxLoss class Loss(compile_utils.LossesContainer): @@ -64,7 +67,7 @@ def __call__(self, y_true, y_pred, sample_weight=None, regularization_losses=Non self._built = True loss_value = self.call(y_true, y_pred, sample_weight) total_loss_mean_value = tf.nn.compute_average_loss( - loss_value, global_batch_size=FLAGS.batch_size * FLAGS.num_accumulation_steps + loss_value, global_batch_size=flags.FLAGS.batch_size * flags.FLAGS.num_accumulation_steps ) self._loss_metric.update_state(total_loss_mean_value, diff --git a/deepray/losses/_loss_util.py b/deepray/losses/_loss_util.py new file mode 100644 index 00000000..5eaa4561 --- /dev/null +++ b/deepray/losses/_loss_util.py @@ -0,0 +1,281 @@ +# Copyright 2024 The TensorFlow Ranking Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Implements the losses for TF-Ranking.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import abc +import math +from typing import Callable, Dict, Tuple +import tensorflow as tf + +_PADDING_LABEL = -1. +_PADDING_PREDICTION = -1e6 +_PADDING_WEIGHT = 0. + +TensorLike = tf.types.experimental.TensorLike +TransformationFunction = Callable[[TensorLike], tf.Tensor] +LossFunction = Callable[[TensorLike, TensorLike, Dict[str, TensorLike]], tf.Tensor] +MetricFunction = Callable[[TensorLike, TensorLike, Dict[str, TensorLike]], tf.Tensor] + + +def serialize_keras_object(obj): + if hasattr(tf.keras.utils, "legacy"): + return tf.keras.utils.legacy.serialize_keras_object(obj) + else: + return tf.keras.utils.serialize_keras_object(obj) + + +def deserialize_keras_object(config, module_objects=None, custom_objects=None, printable_module_name=None): + if hasattr(tf.keras.utils, "legacy"): + return tf.keras.utils.legacy.deserialize_keras_object(config, custom_objects, module_objects, printable_module_name) + else: + return tf.keras.utils.deserialize_keras_object(config, custom_objects, module_objects, printable_module_name) + + +class _RankingLoss(object, metaclass=abc.ABCMeta): + """Interface for ranking loss.""" + + def __init__(self, name, lambda_weight=None, temperature=1.0, ragged=False): + """Constructor. + + Args: + name: A string used as the name for this loss. + lambda_weight: A `_LambdaWeight` object. + temperature: A float number to modify the logits=logits/temperature. + ragged: A boolean indicating whether the input tensors are ragged. + """ + self._name = name + self._lambda_weight = lambda_weight + self._temperature = temperature + self._ragged = ragged + + @property + def name(self): + """The loss name.""" + return self._name + + def _prepare_and_validate_params(self, labels, logits, weights, mask): + """Prepares and validate input parameters. + + Args: + labels: A `Tensor` of the same shape as `logits` representing graded + relevance. + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + weights: A scalar, a `Tensor` with shape [batch_size, 1] for list-wise + weights, or a `Tensor` with shape [batch_size, list_size] for item-wise + weights. + mask: A `Tensor` of the same shape as logits indicating which entries are + valid for computing the loss. + + Returns: + A tuple (labels, logits, weights, mask) of `tf.Tensor` objects that are + ready to be used in the loss. + """ + if self._ragged: + labels, logits, weights, mask = ragged_to_dense(labels, logits, weights) + + if mask is None: + mask = is_label_valid(labels) + + if weights is None: + weights = 1.0 + + labels = tf.convert_to_tensor(labels) + logits = tf.convert_to_tensor(logits) + weights = tf.convert_to_tensor(weights) + mask = tf.convert_to_tensor(mask) + + return labels, logits, weights, mask + + def compute_unreduced_loss(self, labels, logits, mask=None): + """Computes the unreduced loss. 
+ + Args: + labels: A `Tensor` or `RaggedTensor` of the same shape as `logits` + representing graded relevance. + logits: A `Tensor` or `RaggedTensor` with shape [batch_size, list_size]. + Each value is the ranking score of the corresponding item. + mask: An optional `Tensor` of the same shape as logits indicating which + entries are valid for computing the loss. Will be ignored if the loss + was constructed with ragged=True. + + Returns: + A tuple(losses, loss_weights) that have the same shape. + """ + labels, logits, _, mask = self._prepare_and_validate_params(labels, logits, None, mask) + return self._compute_unreduced_loss_impl(labels, logits, mask) + + @abc.abstractmethod + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """Implementation for the unreduced loss. + + Args: + labels: A `Tensor` of the same shape as `logits` representing graded + relevance. + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + mask: An optional `Tensor` of the same shape as logits indicating which + entries are valid for computing the loss. + + Returns: + A tuple(losses, loss_weights) that have the same shape. + """ + raise NotImplementedError('Calling an abstract method.') + + def normalize_weights(self, labels, weights): + """Normalizes weights. + + This is needed for `tf.estimator` given that the reduction may be + `SUM_OVER_NONZERO_WEIGHTS`. + + This method is also needed to compute normalized weights when calling + `compute_unreduced_loss`, which is done in the tf.keras losses. + + Args: + labels: A `Tensor` of shape [batch_size, list_size] representing graded + relevance. + weights: A scalar, a `Tensor` with shape [batch_size, 1] for list-wise + weights, or a `Tensor` with shape [batch_size, list_size] for item-wise + weights. + + Returns: + The normalized weights. + """ + if self._ragged: + labels, _, weights, _ = utils.ragged_to_dense(labels, None, weights) + return self._normalize_weights_impl(labels, weights) + + def _normalize_weights_impl(self, labels, weights): + """See `normalize_weights`.""" + del labels + return 1.0 if weights is None else weights + + def get_logits(self, logits): + """Computes logits rescaled by temperature. + + Args: + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + + Returns: + Tensor of rescaled logits. + """ + if not tf.is_tensor(logits): + logits = tf.convert_to_tensor(value=logits) + return logits / self._temperature + + def compute(self, labels, logits, weights, reduction, mask=None): + """Computes the reduced loss for tf.estimator (not tf.keras). + + Note that this function is not compatible with keras. + + Args: + labels: A `Tensor` of the same shape as `logits` representing graded + relevance. + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + weights: A scalar, a `Tensor` with shape [batch_size, 1] for list-wise + weights, or a `Tensor` with shape [batch_size, list_size] for item-wise + weights. + reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to + reduce training loss over batch. + mask: A `Tensor` of the same shape as logits indicating which entries are + valid for computing the loss. + + Returns: + Reduced loss for training and eval. 
+ """ + logits = self.get_logits(logits) + losses, loss_weights = self._compute_unreduced_loss_impl(labels, logits, mask) + weights = tf.multiply(self._normalize_weights_impl(labels, weights), loss_weights) + return tf.compat.v1.losses.compute_weighted_loss(losses, weights, reduction=reduction) + + @abc.abstractmethod + def compute_per_list(self, labels, logits, weights, mask=None): + """Computes the per-list loss. + + Args: + labels: A `Tensor` of the same shape as `logits` representing graded + relevance. + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + weights: A scalar, a `Tensor` with shape [batch_size, 1] for list-wise + weights, or a `Tensor` with shape [batch_size, list_size] for item-wise + weights. + mask: A `Tensor` of the same shape as logits indicating which entries are + valid for computing the loss. + + Returns: + A pair of `Tensor` objects of shape [batch_size] containing per-list + losses and weights. + """ + raise NotImplementedError('Calling an abstract method.') + + def eval_metric(self, labels, logits, weights, mask=None): + """Computes the eval metric for the loss in tf.estimator (not tf.keras). + + Note that this function is not compatible with keras. + + Args: + labels: A `Tensor` of the same shape as `logits` representing graded + relevance. + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + weights: A scalar, a `Tensor` with shape [batch_size, 1] for list-wise + weights, or a `Tensor` with shape [batch_size, list_size] for item-wise + weights. + mask: A `Tensor` of the same shape as logits indicating which entries are + valid for computing the metric. + + Returns: + A metric op. + """ + losses, loss_weights = self._compute_unreduced_loss_impl(labels, logits, mask) + weights = tf.multiply(self._normalize_weights_impl(labels, weights), loss_weights) + return tf.compat.v1.metrics.mean(losses, weights) + + +def ragged_to_dense(labels, predictions, weights): + """Converts given inputs from ragged tensors to dense tensors. + + Args: + labels: A `tf.RaggedTensor` of the same shape as `predictions` representing + relevance. + predictions: A `tf.RaggedTensor` with shape [batch_size, (list_size)]. Each + value is the ranking score of the corresponding example. + weights: An optional `tf.RaggedTensor` of the same shape of predictions or a + `tf.Tensor` of shape [batch_size, 1]. The former case is per-example and + the latter case is per-list. + + Returns: + A tuple (labels, predictions, weights, mask) of dense `tf.Tensor`s. + """ + # TODO: Add checks to validate (ragged) shapes of input tensors. + mask = tf.cast(tf.ones_like(labels).to_tensor(0.), dtype=tf.bool) + labels = labels.to_tensor(_PADDING_LABEL) + if predictions is not None: + predictions = predictions.to_tensor(_PADDING_PREDICTION) + if isinstance(weights, tf.RaggedTensor): + weights = weights.to_tensor(_PADDING_WEIGHT) + return labels, predictions, weights, mask + + +def is_label_valid(labels): + """Returns a boolean `Tensor` for label validity.""" + labels = tf.convert_to_tensor(value=labels) + return tf.greater_equal(labels, 0.) 
diff --git a/deepray/losses/contrastive.py b/deepray/losses/contrastive.py index 7d138562..501d47a2 100644 --- a/deepray/losses/contrastive.py +++ b/deepray/losses/contrastive.py @@ -15,9 +15,9 @@ """Implements contrastive loss.""" import tensorflow as tf +from tensorflow.python.keras import losses from typeguard import typechecked -from deepray.utils.keras_utils import LossFunctionWrapper from deepray.utils.types import TensorLike, Number @@ -66,7 +66,7 @@ def contrastive_loss(y_true: TensorLike, y_pred: TensorLike, margin: Number = 1. @tf.keras.utils.register_keras_serializable(package="Deepray") -class ContrastiveLoss(LossFunctionWrapper): +class ContrastiveLoss(losses.LossFunctionWrapper): r"""Computes the contrastive loss between `y_true` and `y_pred`. This loss encourages the embedding to be close to each other for diff --git a/deepray/losses/focal_loss.py b/deepray/losses/focal_loss.py index 7b4b6cd7..21e2ddbe 100644 --- a/deepray/losses/focal_loss.py +++ b/deepray/losses/focal_loss.py @@ -15,15 +15,14 @@ """Implements Focal loss.""" import tensorflow as tf -import tensorflow.keras.backend as K +import tf_keras as keras from typeguard import typechecked -from deepray.utils.keras_utils import LossFunctionWrapper from deepray.utils.types import FloatTensorLike, TensorLike @tf.keras.utils.register_keras_serializable(package="Deepray") -class SigmoidFocalCrossEntropy(LossFunctionWrapper): +class SigmoidFocalCrossEntropy(keras.losses.LossFunctionWrapper): """Implements the focal loss function. Focal loss was first introduced in the RetinaNet paper @@ -118,7 +117,7 @@ def sigmoid_focal_crossentropy( y_true = tf.cast(y_true, dtype=y_pred.dtype) # Get the cross_entropy for each entry - ce = K.binary_crossentropy(y_true, y_pred, from_logits=from_logits) + ce = keras.binary_crossentropy(y_true, y_pred, from_logits=from_logits) # If logits are provided then convert the predictions into probabilities if from_logits: diff --git a/deepray/losses/giou_loss.py b/deepray/losses/giou_loss.py index a2dda7af..81a49d96 100644 --- a/deepray/losses/giou_loss.py +++ b/deepray/losses/giou_loss.py @@ -17,14 +17,14 @@ from typing import Optional import tensorflow as tf +from tensorflow.python.keras import losses from typeguard import typechecked -from deepray.utils.keras_utils import LossFunctionWrapper from deepray.utils.types import TensorLike @tf.keras.utils.register_keras_serializable(package="Deepray") -class GIoULoss(LossFunctionWrapper): +class GIoULoss(losses.LossFunctionWrapper): """Implements the GIoU loss function. 
GIoU loss was first introduced in the diff --git a/deepray/losses/lifted.py b/deepray/losses/lifted.py index 9146440c..7dade105 100644 --- a/deepray/losses/lifted.py +++ b/deepray/losses/lifted.py @@ -14,13 +14,14 @@ # ============================================================================== """Implements lifted_struct_loss.""" +from typing import Optional + import tensorflow as tf -from deepray.losses import metric_learning +from tensorflow.python.keras import losses +from typeguard import typechecked -from deepray.utils.keras_utils import LossFunctionWrapper +from deepray.losses import metric_learning from deepray.utils.types import FloatTensorLike, TensorLike -from typeguard import typechecked -from typing import Optional @tf.keras.utils.register_keras_serializable(package="Deepray") @@ -106,7 +107,7 @@ def lifted_struct_loss(labels: TensorLike, embeddings: TensorLike, margin: Float @tf.keras.utils.register_keras_serializable(package="Deepray") -class LiftedStructLoss(LossFunctionWrapper): +class LiftedStructLoss(losses.LossFunctionWrapper): """Computes the lifted structured loss. The loss encourages the positive distances (between a pair of embeddings diff --git a/deepray/losses/losses_impl.py b/deepray/losses/losses_impl.py new file mode 100644 index 00000000..6230ac9e --- /dev/null +++ b/deepray/losses/losses_impl.py @@ -0,0 +1,1937 @@ +# Copyright 2024 The TensorFlow Ranking Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Implements the losses for TF-Ranking.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import abc +import math + +import tensorflow as tf +from deepray.losses import utils + +# The smallest probability that is used to derive smallest logit for invalid or +# padding entries. +_EPSILON = 1e-10 + + +def _safe_default_gain_fn(labels): + """Calculates safe gain functions for NDCG. + + In applications such as distillation, the labels could have extreme values + that might result in numerical error when using the original gain function. + This should only be applied to NDCG related losses, but not DCG ones. It + should be applied on both the numerator and the denominator of NDCG. + + Args: + labels: A `Tensor` with shape [batch_size, list_size], representing graded + relevance. + Returns: + A `tensor` of safe gain function values of shape [batch_size, list_size]. 
+ """ + max_labels = tf.reduce_max(labels, axis=-1, keepdims=True) + gains = tf.pow(2., labels - max_labels) - tf.pow(2., -max_labels) + return gains + + +def _check_tensor_shapes(tensors): + """Checks the tensor shapes to be compatible.""" + for tensor in tensors: + tensor = tf.convert_to_tensor(value=tensor) + tensor.get_shape().assert_has_rank(2) + tensor.get_shape().assert_is_compatible_with(tf.convert_to_tensor(value=tensors[0]).get_shape()) + + +def _apply_pairwise_op(op, tensor): + """Applies the op on tensor in the pairwise manner.""" + _check_tensor_shapes([tensor]) + return op(tf.expand_dims(tensor, 2), tf.expand_dims(tensor, 1)) + + +def _get_valid_pairs_and_clean_labels(labels): + """Returns a boolean Tensor for valid pairs and cleaned labels.""" + labels = tf.convert_to_tensor(value=labels) + labels.get_shape().assert_has_rank(2) + is_valid = utils.is_label_valid(labels) + valid_pairs = _apply_pairwise_op(tf.logical_and, is_valid) + labels = tf.compat.v1.where(is_valid, labels, tf.zeros_like(labels)) + return valid_pairs, labels + + +def approx_ranks(logits): + r"""Computes approximate ranks given a list of logits. + + Given a list of logits, the rank of an item in the list is one plus the total + number of items with a larger logit. In other words, + + rank_i = 1 + \sum_{j \neq i} I_{s_j > s_i}, + + where "I" is the indicator function. The indicator function can be + approximated by a generalized sigmoid: + + I_{s_j < s_i} \approx 1/(1 + exp(-(s_j - s_i)/temperature)). + + This function approximates the rank of an item using this sigmoid + approximation to the indicator function. This technique is at the core + of "A general approximation framework for direct optimization of + information retrieval measures" by Qin et al. + + Args: + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + + Returns: + A `Tensor` of ranks with the same shape as logits. + """ + list_size = tf.shape(input=logits)[1] + x = tf.tile(tf.expand_dims(logits, 2), [1, 1, list_size]) + y = tf.tile(tf.expand_dims(logits, 1), [1, list_size, 1]) + pairs = tf.sigmoid(y - x) + return tf.reduce_sum(input_tensor=pairs, axis=-1) + .5 + + +def inverse_max_dcg( + labels, + gain_fn=lambda labels: tf.pow(2.0, labels) - 1., + rank_discount_fn=lambda rank: 1. / tf.math.log1p(rank), + topn=None +): + """Computes the inverse of max DCG. + + Args: + labels: A `Tensor` with shape [batch_size, list_size]. Each value is the + graded relevance of the corresponding item. + gain_fn: A gain function. By default this is set to: 2^label - 1. + rank_discount_fn: A discount function. By default this is set to: + 1/log(1+rank). + topn: An integer as the cutoff of examples in the sorted list. + + Returns: + A `Tensor` with shape [batch_size, 1]. + """ + ideal_sorted_labels, = utils.sort_by_scores(labels, [labels], topn=topn) + rank = tf.range(tf.shape(input=ideal_sorted_labels)[1]) + 1 + discounted_gain = gain_fn(ideal_sorted_labels) * rank_discount_fn(tf.cast(rank, dtype=tf.float32)) + discounted_gain = tf.reduce_sum(input_tensor=discounted_gain, axis=1, keepdims=True) + return tf.compat.v1.where(tf.greater(discounted_gain, 0.), 1. / discounted_gain, tf.zeros_like(discounted_gain)) + + +def ndcg(labels, ranks=None, perm_mat=None): + """Computes NDCG from labels and ranks. + + Args: + labels: A `Tensor` with shape [batch_size, list_size], representing graded + relevance. + ranks: A `Tensor` of the same shape as labels, or [1, list_size], or None. 
+ If ranks=None, we assume the labels are sorted in their rank. + perm_mat: A `Tensor` with shape [batch_size, list_size, list_size] or None. + Permutation matrices with rows correpond to the ranks and columns + correspond to the indices. An argmax over each row gives the index of the + element at the corresponding rank. + + Returns: + A `tensor` of NDCG, ApproxNDCG, or ExpectedNDCG of shape [batch_size, 1]. + """ + if ranks is not None and perm_mat is not None: + raise ValueError('Cannot use both ranks and perm_mat simultaneously.') + + if ranks is None: + list_size = tf.shape(labels)[1] + ranks = tf.range(list_size) + 1 + discounts = 1. / tf.math.log1p(tf.cast(ranks, dtype=tf.float32)) + gains = _safe_default_gain_fn(tf.cast(labels, dtype=tf.float32)) + if perm_mat is not None: + gains = tf.reduce_sum(input_tensor=perm_mat * tf.expand_dims(gains, 1), axis=-1) + dcg = tf.reduce_sum(input_tensor=gains * discounts, axis=-1, keepdims=True) + normalized_dcg = dcg * inverse_max_dcg(labels, gain_fn=_safe_default_gain_fn) + + return normalized_dcg + + +class _LambdaWeight(object, metaclass=abc.ABCMeta): + """Interface for ranking metric optimization. + + This class wraps weights used in the LambdaLoss framework for ranking metric + optimization (https://ai.google/research/pubs/pub47258). Such an interface is + to be instantiated by concrete lambda weight models. The instance is used + together with standard loss such as logistic loss and softmax loss. + """ + # TODO: Define a public version of `_LambdaWeight` for typing + # annotations. + + @abc.abstractmethod + def pair_weights(self, labels, ranks): + """Returns the weight adjustment `Tensor` for example pairs. + + Args: + labels: A dense `Tensor` of labels with shape [batch_size, list_size]. + ranks: A dense `Tensor` of ranks with the same shape as `labels` that are + sorted by logits. + + Returns: + A `Tensor` that can weight example pairs. + """ + raise NotImplementedError('Calling an abstract method.') + + def individual_weights(self, labels, ranks): + """Returns the weight `Tensor` for individual examples. + + Args: + labels: A dense `Tensor` of labels with shape [batch_size, list_size]. + ranks: A dense `Tensor` of ranks with the same shape as `labels` that are + sorted by logits. + + Returns: + A `Tensor` that can weight individual examples. + """ + del ranks + return labels + + +class LabelDiffLambdaWeight(_LambdaWeight): + """A simple LambdaWeight to compute the pair label difference.""" + + def pair_weights(self, labels, ranks): + """Returns the absolute label difference for each pair.""" + del ranks # Unused. + return tf.abs(_apply_pairwise_op(tf.subtract, labels)) + + +class AbstractDCGLambdaWeight(_LambdaWeight): + """Abstract LambdaWeight for Discounted Cumulative Gain (DCG) metric.""" + + def __init__(self, topn=None, gain_fn=lambda label: label, rank_discount_fn=lambda rank: 1. / rank, normalized=False): + """Initializer. + + Ranks are 1-based, not 0-based. + + Args: + topn: (int) The topn for the DCG metric. + gain_fn: (function) Transforms labels. + rank_discount_fn: (function) The rank discount function. + normalized: (bool) If True, normalize weight by the max DCG. + """ + self._topn = topn + self._gain_fn = gain_fn + self._rank_discount_fn = rank_discount_fn + self._normalized = normalized + + @abc.abstractmethod + def _pair_rank_discount(self, ranks, topn): + """Computes the rank-based discount for a pair. + + Args: + ranks: A 2D `Tensor` for the 1-based ranks. + topn: A scalar `Tensor` for the topn cutoff. 
+ + Returns: + A pairwise weights `Tensor` based on the `rank_discount_fn`. + """ + raise NotImplementedError('Calling an abstract method.') + + def pair_weights(self, labels, ranks): + """See `_LambdaWeight`.""" + with tf.compat.v1.name_scope(name='dcg_lambda_weight'): + _check_tensor_shapes([labels, ranks]) + valid_pair, labels = _get_valid_pairs_and_clean_labels(labels) + gain = self._gain_fn(labels) + if self._normalized: + gain *= inverse_max_dcg(labels, gain_fn=self._gain_fn, rank_discount_fn=self._rank_discount_fn, topn=self._topn) + pair_gain = _apply_pairwise_op(tf.subtract, gain) + pair_gain *= tf.cast(valid_pair, dtype=tf.float32) + + list_size = tf.shape(input=labels)[1] + topn = self._topn or list_size + pair_weight = tf.abs(pair_gain) * self._pair_rank_discount(ranks, topn) + + # For LambdaLoss with relative rank difference, the scale of loss becomes + # much smaller when applying LambdaWeight. This affects the training can + # make the optimal learning rate become much larger. We use a heuristic to + # scale it up to the same magnitude as standard pairwise loss. + pair_weight *= tf.cast(tf.shape(input=labels)[1], dtype=tf.float32) + return pair_weight + + def individual_weights(self, labels, ranks): + """See `_LambdaWeight`.""" + with tf.compat.v1.name_scope(name='dcg_lambda_weight'): + _check_tensor_shapes([labels, ranks]) + labels = tf.convert_to_tensor(value=labels) + labels = tf.compat.v1.where(utils.is_label_valid(labels), labels, tf.zeros_like(labels)) + gain = self._gain_fn(labels) + if self._normalized: + gain *= inverse_max_dcg(labels, gain_fn=self._gain_fn, rank_discount_fn=self._rank_discount_fn, topn=self._topn) + rank_discount = self._rank_discount_fn(tf.cast(ranks, dtype=tf.float32)) + return gain * rank_discount + + +class DCGLambdaWeight(AbstractDCGLambdaWeight): + """LambdaWeight for Discounted Cumulative Gain metric.""" + + def __init__( + self, + topn=None, + gain_fn=lambda label: label, + rank_discount_fn=lambda rank: 1. / rank, + normalized=False, + smooth_fraction=0. + ): + """Initializer. + + Ranks are 1-based, not 0-based. Given rank i and j, there are two types of + pair weights: + u = |rank_discount_fn(|i-j|) - rank_discount_fn(|i-j| + 1)| + v = |rank_discount_fn(i) - rank_discount_fn(j)| + where u is the newly introduced one in LambdaLoss paper + (https://ai.google/research/pubs/pub47258) and v is the original one in the + LambdaMART paper "From RankNet to LambdaRank to LambdaMART: An Overview". + The final pair weight contribution of ranks is + (1-smooth_fraction) * u + smooth_fraction * v. + + Args: + topn: (int) The topn for the DCG metric. + gain_fn: (function) Transforms labels. + rank_discount_fn: (function) The rank discount function. + normalized: (bool) If True, normalize weight by the max DCG. + smooth_fraction: (float) parameter to control the contribution from + LambdaMART. + """ + super().__init__(topn, gain_fn, rank_discount_fn, normalized) + if not 0. <= smooth_fraction <= 1.: + raise ValueError('smooth_fraction %s should be in range [0, 1].' % smooth_fraction) + self._smooth_fraction = smooth_fraction + + def _pair_rank_discount(self, ranks, topn): + """See `_LambdaWeight`.""" + + def _discount_for_relative_rank_diff(): + """Rank-based discount in the LambdaLoss paper.""" + # The LambdaLoss is not well defined when topn is active and topn < + # list_size. The following implementation is based on Equation 18 proposed + # in https://research.google/pubs/pub47258/. 
Please refer to + # `DCGLambdaWeightV2` for a better implemention to handle topn. + pair_valid_rank = _apply_pairwise_op(tf.logical_or, tf.less_equal(ranks, topn)) + rank_diff = tf.cast(tf.abs(_apply_pairwise_op(tf.subtract, ranks)), dtype=tf.float32) + pair_discount = tf.where( + tf.logical_and(tf.greater(rank_diff, 0), pair_valid_rank), + tf.abs(self._rank_discount_fn(rank_diff) - self._rank_discount_fn(rank_diff + 1)), tf.zeros_like(rank_diff) + ) + return pair_discount + + def _discount_for_absolute_rank(): + """Standard discount in the LambdaMART paper.""" + # When the rank discount is (1 / rank) for example, the discount is + # |1 / r_i - 1 / r_j|. When i or j > topn, the discount becomes 0. + rank_discount = tf.compat.v1.where( + tf.greater(ranks, topn), tf.zeros_like(tf.cast(ranks, dtype=tf.float32)), + self._rank_discount_fn(tf.cast(ranks, dtype=tf.float32)) + ) + pair_discount = tf.abs(_apply_pairwise_op(tf.subtract, rank_discount)) + return pair_discount + + u = _discount_for_relative_rank_diff() + v = _discount_for_absolute_rank() + pair_discount = (1. - self._smooth_fraction) * u + self._smooth_fraction * v + pair_mask = _apply_pairwise_op(tf.logical_or, tf.less_equal(ranks, topn)) + return pair_discount * tf.cast(pair_mask, dtype=tf.float32) + + +class DCGLambdaWeightV2(AbstractDCGLambdaWeight): + """The V2 version of LambdaWeight for DCG metric. + + V2: Everything is the same as LambdaLoss when topn=None. When topn is + activated, for any pair i, j where max(i, j) > topn, we multiply the inverse + of 1-1/log(1+max(i,j)) for example. + """ + + def _pair_rank_discount(self, ranks, topn): + """Implements the rank discount for pairs in topn metrics.""" + rank_diff = tf.cast(tf.abs(_apply_pairwise_op(tf.subtract, ranks)), dtype=tf.float32) + max_rank = tf.cast(_apply_pairwise_op(tf.math.maximum, ranks), tf.float32) + multiplier = tf.where( + tf.greater(max_rank, tf.cast(topn, tf.float32)), 1. / (1. - self._rank_discount_fn(max_rank)), 1. + ) + pair_discount = tf.where( + tf.greater(rank_diff, 0.), + tf.abs(self._rank_discount_fn(rank_diff) - self._rank_discount_fn(rank_diff + 1)) * multiplier, + tf.zeros_like(rank_diff) + ) + return pair_discount + + +class YetiDCGLambdaWeight(DCGLambdaWeightV2): + """A simple LambdaWeight to compute pair weight on neighbor pairs.""" + + def pair_weights(self, labels: tf.Tensor, ranks: tf.Tensor) -> tf.Tensor: + """See `_LambdaWeight`.""" + pair_weight = super().pair_weights(labels, ranks) + with tf.compat.v1.name_scope(name='yeti_dcg_lambda_weight'): + neighbor_pair = tf.equal(tf.abs(_apply_pairwise_op(tf.subtract, ranks)), 1) + pair_weight *= tf.cast(neighbor_pair, dtype=tf.float32) + return pair_weight + + +class PrecisionLambdaWeight(_LambdaWeight): + """LambdaWeight for Precision metric.""" + + def __init__(self, topn, positive_fn=lambda label: tf.greater_equal(label, 1.0)): + """Constructor. + + Args: + topn: (int) The K in Precision@K metric. + positive_fn: (function): A function on `Tensor` that output boolean True + for positive examples. The rest are negative examples. + """ + self._topn = topn + self._positive_fn = positive_fn + + def pair_weights(self, labels, ranks): + """See `_LambdaWeight`. + + The current implementation here is that for any pairs of documents i and j, + we set the weight to be 1 if + - i and j have different labels. + - i <= topn and j > topn or i > topn and j <= topn. + This is exactly the same as the original LambdaRank method. The weight is + the gain of swapping a pair of documents. 
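+
+    For example (illustrative): with topn=2, binary labels [0, 1, 0, 1] for
+    the items at ranks [1, 2, 3, 4], only the label-disagreeing pairs that
+    straddle the top-2 cutoff get weight 1, namely (rank 1, rank 4) and
+    (rank 2, rank 3); swapping either pair changes Precision@2.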
+ + Args: + labels: A dense `Tensor` of labels with shape [batch_size, list_size]. + ranks: A dense `Tensor` of ranks with the same shape as `labels` that are + sorted by logits. + + Returns: + A `Tensor` that can weight example pairs. + """ + with tf.compat.v1.name_scope(name='precision_lambda_weight'): + _check_tensor_shapes([labels, ranks]) + valid_pair, labels = _get_valid_pairs_and_clean_labels(labels) + binary_labels = tf.cast(self._positive_fn(labels), dtype=tf.float32) + label_diff = tf.abs(_apply_pairwise_op(tf.subtract, binary_labels)) + label_diff *= tf.cast(valid_pair, dtype=tf.float32) + # i <= topn and j > topn or i > topn and j <= topn, i.e., xor(i <= topn, j + # <= topn). + rank_mask = _apply_pairwise_op(tf.math.logical_xor, tf.less_equal(ranks, self._topn)) + return label_diff * tf.cast(rank_mask, dtype=tf.float32) + + +class ListMLELambdaWeight(_LambdaWeight): + """LambdaWeight for ListMLE cost function.""" + + def __init__(self, rank_discount_fn): + """Constructor. + + Ranks are 1-based, not 0-based. + + Args: + rank_discount_fn: (function) The rank discount function. + """ + self._rank_discount_fn = rank_discount_fn + + def pair_weights(self, labels, ranks): + """See `_LambdaWeight`.""" + pass + + def individual_weights(self, labels, ranks): + """See `_LambdaWeight`.""" + with tf.compat.v1.name_scope(name='p_list_mle_lambda_weight'): + _check_tensor_shapes([labels, ranks]) + labels = tf.convert_to_tensor(value=labels) + rank_discount = self._rank_discount_fn(tf.cast(ranks, dtype=tf.float32)) + return tf.ones_like(labels) * rank_discount + + +def _compute_ranks(logits, is_valid): + """Computes ranks by sorting valid logits. + + Args: + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + is_valid: A `Tensor` of the same shape as `logits` representing validity of + each entry. + + Returns: + The `ranks` Tensor. + """ + _check_tensor_shapes([logits, is_valid]) + # Only sort entries with is_valid = True. + scores = tf.compat.v1.where( + is_valid, logits, -1e-6 * tf.ones_like(logits) + tf.reduce_min(input_tensor=logits, axis=1, keepdims=True) + ) + return utils.sorted_ranks(scores) + + +def _pairwise_comparison(labels, logits, mask, pairwise_logits_op=tf.subtract): + r"""Returns pairwise comparison `Tensor`s. + + Given a list of n items, the labels of graded relevance l_i and the logits + s_i, we form n^2 pairs. For each pair, we have the following: + + / + | 1 if l_i > l_j for valid l_i and l_j. + * `pairwise_labels` = | + | 0 otherwise + \ + * `pairwise_logits` = pairwise_logits_op(s_i, s_j) + + Args: + labels: A `Tensor` with shape [batch_size, list_size]. + logits: A `Tensor` with shape [batch_size, list_size]. + mask: A `Tensor` with shape [batch_size, list_size] indicating which entries + are valid for computing the pairwise comparisons. + pairwise_logits_op: A pairwise function which operates on 2 tensors. + + Returns: + A tuple of (pairwise_labels, pairwise_logits) with each having the shape + [batch_size, list_size, list_size]. + """ + # Compute the difference for all pairs in a list. The output is a Tensor with + # shape [batch_size, list_size, list_size] where the entry [-1, i, j] stores + # the information for pair (i, j). + pairwise_label_diff = _apply_pairwise_op(tf.subtract, labels) + pairwise_logits = _apply_pairwise_op(pairwise_logits_op, logits) + # Only keep the case when l_i > l_j. 
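+  # Illustrative sketch: for a single list with labels [3, 1, 2], the label
+  # difference matrix is [[0, 2, 1], [-2, 0, -1], [-1, 1, 0]], so
+  # pairwise_labels below is [[0, 1, 1], [0, 0, 0], [0, 1, 0]] before masking.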
+ pairwise_labels = tf.cast(tf.greater(pairwise_label_diff, 0), dtype=tf.float32) + valid_pair = _apply_pairwise_op(tf.logical_and, mask) + pairwise_labels *= tf.cast(valid_pair, dtype=tf.float32) + return pairwise_labels, pairwise_logits + + +class GumbelSampler(object): + """Random sampler for sampling gumbel distributed logits.""" + + def __init__(self, name=None, sample_size=8, temperature=1.0, seed=None, ragged=False): + """Constructor.""" + self._name = name + self._sample_size = sample_size + self._temperature = temperature + self._seed = seed + self._ragged = ragged + + def sample(self, labels, logits, weights=None): + """Samples scores from Concrete(logits). + + If the sampler was constructed with `ragged=True` this method expects + `labels`, `logits` and item-wise `weights` to be a `RaggedTensor`. + + Args: + labels: A `Tensor` or `RaggedTensor` with shape [batch_size, list_size] + same as `logits`, representing graded relevance. Or in the diversity + tasks, a `Tensor` (or `RaggedTensor`) with shape [batch_size, list_size, + subtopic_size]. Each value represents relevance to a subtopic, 1 for + relevent subtopic, 0 for irrelevant, and -1 for paddings. When the + actual subtopic number of a query is smaller than the `subtopic_size`, + `labels` will be padded to `subtopic_size` with -1. + logits: A `Tensor` or `RaggedTensor` with shape [batch_size, list_size]. + Each value is the ranking score of the corresponding item. + weights: A scalar, a `Tensor` with shape [batch_size, 1] for list-wise + weights, or a `Tensor` or `RaggedTensor` with shape [batch_size, + list_size] for item-wise weights. If None, the weight of a list in the + mini-batch is set to the sum of the labels of the items in that list. + + Returns: + A tuple of expanded labels, logits, and weights where the first dimension + is now batch_size * sample_size. Logit Tensors are sampled from + Concrete(logits) while labels and weights are simply tiled so the + resulting + Tensor has the updated dimensions. + """ + with tf.compat.v1.name_scope(self._name, 'gumbel_softmax_sample', (labels, logits, weights)): + # Convert ragged tensors to dense and construct a mask. + if self._ragged: + is_weights_ragged = isinstance(weights, tf.RaggedTensor) + labels, logits, weights, mask = utils.ragged_to_dense(labels, logits, weights) + + batch_size = tf.shape(input=labels)[0] + list_size = tf.shape(input=labels)[1] + + # Expand labels. + expanded_labels = tf.expand_dims(labels, 1) + expanded_labels = tf.repeat(expanded_labels, [self._sample_size], axis=1) + expanded_labels = utils.reshape_first_ndims(expanded_labels, 2, [batch_size * self._sample_size]) + + # Sample logits from Concrete(logits). 
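+      # Background note: -log(-log(u)) with u ~ Uniform(0, 1) is a standard
+      # Gumbel(0, 1) draw (see _sample_gumbel below); adding independent
+      # Gumbel noise to the logits and sorting the perturbed scores samples a
+      # permutation from the Plackett-Luce model induced by the logits, and
+      # the temperature rescaling below sharpens or smooths that relaxation.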
+ sampled_logits = tf.expand_dims(logits, 1) + sampled_logits = tf.tile(sampled_logits, [1, self._sample_size, 1]) + sampled_logits += _sample_gumbel([batch_size, self._sample_size, list_size], seed=self._seed) + sampled_logits = tf.reshape(sampled_logits, [batch_size * self._sample_size, list_size]) + + is_label_valid = utils.is_label_valid(expanded_labels) + if is_label_valid.shape.rank > 2: + is_label_valid = tf.reduce_any(is_label_valid, axis=-1) + sampled_logits = tf.compat.v1.where( + is_label_valid, sampled_logits / self._temperature, + tf.math.log(1e-20) * tf.ones_like(sampled_logits) + ) + sampled_logits = tf.math.log(tf.nn.softmax(sampled_logits) + 1e-20) + + expanded_weights = weights + if expanded_weights is not None: + true_fn = lambda: tf.expand_dims(tf.expand_dims(expanded_weights, 1), 1) + false_fn = lambda: tf.expand_dims(expanded_weights, 1) + expanded_weights = tf.cond(pred=tf.math.equal(tf.rank(expanded_weights), 1), true_fn=true_fn, false_fn=false_fn) + expanded_weights = tf.tile(expanded_weights, [1, self._sample_size, 1]) + expanded_weights = tf.reshape(expanded_weights, [batch_size * self._sample_size, -1]) + + # Convert dense tensors back to ragged. + if self._ragged: + # Construct expanded mask for the number of samples. + expanded_mask = tf.expand_dims(mask, 1) + expanded_mask = tf.repeat(expanded_mask, [self._sample_size], axis=1) + expanded_mask = tf.reshape(expanded_mask, [batch_size * self._sample_size, list_size]) + # Convert labels and sampled logits to ragged tensors. + expanded_labels = tf.ragged.boolean_mask(expanded_labels, expanded_mask) + sampled_logits = tf.ragged.boolean_mask(sampled_logits, expanded_mask) + # If ragged weights were provided, convert dense weights back to ragged. + if is_weights_ragged: + expanded_weights = tf.ragged.boolean_mask(expanded_weights, expanded_mask) + + return expanded_labels, sampled_logits, expanded_weights + + +def _sample_gumbel(shape, eps=1e-20, seed=None): + u = tf.random.uniform(shape, minval=0, maxval=1, dtype=tf.float32, seed=seed) + return -tf.math.log(-tf.math.log(u + eps) + eps) + + +class _RankingLoss(object, metaclass=abc.ABCMeta): + """Interface for ranking loss.""" + + def __init__(self, name, lambda_weight=None, temperature=1.0, ragged=False): + """Constructor. + + Args: + name: A string used as the name for this loss. + lambda_weight: A `_LambdaWeight` object. + temperature: A float number to modify the logits=logits/temperature. + ragged: A boolean indicating whether the input tensors are ragged. + """ + self._name = name + self._lambda_weight = lambda_weight + self._temperature = temperature + self._ragged = ragged + + @property + def name(self): + """The loss name.""" + return self._name + + def _prepare_and_validate_params(self, labels, logits, weights, mask): + """Prepares and validate input parameters. + + Args: + labels: A `Tensor` of the same shape as `logits` representing graded + relevance. + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + weights: A scalar, a `Tensor` with shape [batch_size, 1] for list-wise + weights, or a `Tensor` with shape [batch_size, list_size] for item-wise + weights. + mask: A `Tensor` of the same shape as logits indicating which entries are + valid for computing the loss. + + Returns: + A tuple (labels, logits, weights, mask) of `tf.Tensor` objects that are + ready to be used in the loss. 
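+
+    Note (following the TF-Ranking convention used throughout this file):
+    when `mask` is None it is derived from `utils.is_label_valid`, which
+    treats entries with label >= 0 as valid and negative labels (e.g. the -1
+    padding value) as invalid.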
+ """ + if self._ragged: + labels, logits, weights, mask = utils.ragged_to_dense(labels, logits, weights) + + if mask is None: + mask = utils.is_label_valid(labels) + + if weights is None: + weights = 1.0 + + labels = tf.convert_to_tensor(labels) + logits = tf.convert_to_tensor(logits) + weights = tf.convert_to_tensor(weights) + mask = tf.convert_to_tensor(mask) + + return labels, logits, weights, mask + + def compute_unreduced_loss(self, labels, logits, mask=None): + """Computes the unreduced loss. + + Args: + labels: A `Tensor` or `RaggedTensor` of the same shape as `logits` + representing graded relevance. + logits: A `Tensor` or `RaggedTensor` with shape [batch_size, list_size]. + Each value is the ranking score of the corresponding item. + mask: An optional `Tensor` of the same shape as logits indicating which + entries are valid for computing the loss. Will be ignored if the loss + was constructed with ragged=True. + + Returns: + A tuple(losses, loss_weights) that have the same shape. + """ + labels, logits, _, mask = self._prepare_and_validate_params(labels, logits, None, mask) + return self._compute_unreduced_loss_impl(labels, logits, mask) + + @abc.abstractmethod + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """Implementation for the unreduced loss. + + Args: + labels: A `Tensor` of the same shape as `logits` representing graded + relevance. + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + mask: An optional `Tensor` of the same shape as logits indicating which + entries are valid for computing the loss. + + Returns: + A tuple(losses, loss_weights) that have the same shape. + """ + raise NotImplementedError('Calling an abstract method.') + + def normalize_weights(self, labels, weights): + """Normalizes weights. + + This is needed for `tf.estimator` given that the reduction may be + `SUM_OVER_NONZERO_WEIGHTS`. + + This method is also needed to compute normalized weights when calling + `compute_unreduced_loss`, which is done in the tf.keras losses. + + Args: + labels: A `Tensor` of shape [batch_size, list_size] representing graded + relevance. + weights: A scalar, a `Tensor` with shape [batch_size, 1] for list-wise + weights, or a `Tensor` with shape [batch_size, list_size] for item-wise + weights. + + Returns: + The normalized weights. + """ + if self._ragged: + labels, _, weights, _ = utils.ragged_to_dense(labels, None, weights) + return self._normalize_weights_impl(labels, weights) + + def _normalize_weights_impl(self, labels, weights): + """See `normalize_weights`.""" + del labels + return 1.0 if weights is None else weights + + def get_logits(self, logits): + """Computes logits rescaled by temperature. + + Args: + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + + Returns: + Tensor of rescaled logits. + """ + if not tf.is_tensor(logits): + logits = tf.convert_to_tensor(value=logits) + return logits / self._temperature + + def compute(self, labels, logits, weights, reduction, mask=None): + """Computes the reduced loss for tf.estimator (not tf.keras). + + Note that this function is not compatible with keras. + + Args: + labels: A `Tensor` of the same shape as `logits` representing graded + relevance. + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. 
+ weights: A scalar, a `Tensor` with shape [batch_size, 1] for list-wise + weights, or a `Tensor` with shape [batch_size, list_size] for item-wise + weights. + reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to + reduce training loss over batch. + mask: A `Tensor` of the same shape as logits indicating which entries are + valid for computing the loss. + + Returns: + Reduced loss for training and eval. + """ + logits = self.get_logits(logits) + losses, loss_weights = self._compute_unreduced_loss_impl(labels, logits, mask) + weights = tf.multiply(self._normalize_weights_impl(labels, weights), loss_weights) + return tf.compat.v1.losses.compute_weighted_loss(losses, weights, reduction=reduction) + + @abc.abstractmethod + def compute_per_list(self, labels, logits, weights, mask=None): + """Computes the per-list loss. + + Args: + labels: A `Tensor` of the same shape as `logits` representing graded + relevance. + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + weights: A scalar, a `Tensor` with shape [batch_size, 1] for list-wise + weights, or a `Tensor` with shape [batch_size, list_size] for item-wise + weights. + mask: A `Tensor` of the same shape as logits indicating which entries are + valid for computing the loss. + + Returns: + A pair of `Tensor` objects of shape [batch_size] containing per-list + losses and weights. + """ + raise NotImplementedError('Calling an abstract method.') + + def eval_metric(self, labels, logits, weights, mask=None): + """Computes the eval metric for the loss in tf.estimator (not tf.keras). + + Note that this function is not compatible with keras. + + Args: + labels: A `Tensor` of the same shape as `logits` representing graded + relevance. + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + weights: A scalar, a `Tensor` with shape [batch_size, 1] for list-wise + weights, or a `Tensor` with shape [batch_size, list_size] for item-wise + weights. + mask: A `Tensor` of the same shape as logits indicating which entries are + valid for computing the metric. + + Returns: + A metric op. + """ + losses, loss_weights = self._compute_unreduced_loss_impl(labels, logits, mask) + weights = tf.multiply(self._normalize_weights_impl(labels, weights), loss_weights) + return tf.compat.v1.metrics.mean(losses, weights) + + +class _PairwiseLoss(_RankingLoss, metaclass=abc.ABCMeta): + """Interface for pairwise ranking loss.""" + + @abc.abstractmethod + def _pairwise_loss(self, pairwise_logits): + """The loss of pairwise logits with l_i > l_j.""" + raise NotImplementedError('Calling an abstract method.') + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + ranks = _compute_ranks(logits, mask) + pairwise_labels, pairwise_logits = _pairwise_comparison(labels, logits, mask) + pairwise_weights = pairwise_labels + if self._lambda_weight is not None: + pairwise_weights *= self._lambda_weight.pair_weights(labels, ranks) + + pairwise_weights = tf.stop_gradient(pairwise_weights, name='weights_stop_gradient') + return self._pairwise_loss(pairwise_logits), pairwise_weights + + def compute_per_list(self, labels, logits, weights, mask=None): + """See `_RankingLoss`.""" + # Prepare input params. 
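+    # Illustrative sketch: for a list with labels [1., 0.] and logits
+    # [s1, s2], the only pair with l_i > l_j is (1, 2), so with
+    # PairwiseLogisticLoss the per-list loss below reduces to
+    # log(1 + exp(-(s1 - s2))) before any lambda or item weights.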
+ labels, logits, weights, mask = self._prepare_and_validate_params(labels, logits, weights, mask) + + # Pairwise losses and weights will be of shape + # [batch_size, list_size, list_size]. + losses, loss_weights = self._compute_unreduced_loss_impl(labels, logits, mask) + weights = tf.multiply(self._normalize_weights_impl(labels, weights), loss_weights) + + # Compute the weighted per-pair loss. + weighted_per_pair_loss = tf.math.multiply(losses, weights) + + # Sum the inner dimensions to obtain per-list weights. For pairwise losses + # this typically indicates the (weighted) number of pairwise preferences per + # list. + per_list_weights = tf.reduce_sum(weights, axis=[1, 2]) + + # This computes the per-list losses by summing all weighted pairwise losses. + per_list_losses = tf.reduce_sum(weighted_per_pair_loss, axis=[1, 2]) + + # Normalize the per-list losses so that lists with different numbers of + # pairs have comparable losses. The different numbers of pairs is reflected + # in the per-list weights. + per_list_losses = tf.math.divide_no_nan(per_list_losses, per_list_weights) + + return per_list_losses, per_list_weights + + def _normalize_weights_impl(self, labels, weights): + """See _RankingLoss.""" + # The `weights` is item-wise and is applied non-symmetrically to update + # pairwise_weights as + # pairwise_weights(i, j) = w_i * pairwise_weights(i, j). + # This effectively applies to all pairs with l_i > l_j. Note that it is + # actually symmetric when `weights` are constant per list, i.e., listwise + # weights. + if weights is None: + weights = 1. + weights = tf.compat.v1.where(utils.is_label_valid(labels), tf.ones_like(labels) * weights, tf.zeros_like(labels)) + return tf.expand_dims(weights, axis=2) + + +class PairwiseLogisticLoss(_PairwiseLoss): + """Implements pairwise logistic loss.""" + + def _pairwise_loss(self, pairwise_logits): + """See `_PairwiseLoss`.""" + # The following is the same as log(1 + exp(-pairwise_logits)). + return tf.nn.relu(-pairwise_logits) + tf.math.log1p(tf.exp(-tf.abs(pairwise_logits))) + + +class PairwiseHingeLoss(_PairwiseLoss): + """Implements pairwise hinge loss.""" + + def _pairwise_loss(self, pairwise_logits): + """See `_PairwiseLoss`.""" + return tf.nn.relu(1 - pairwise_logits) + + +class PairwiseSoftZeroOneLoss(_PairwiseLoss): + """Implements pairwise hinge loss.""" + + def _pairwise_loss(self, pairwise_logits): + """See `_PairwiseLoss`.""" + return tf.compat.v1.where( + tf.greater(pairwise_logits, 0), 1. - tf.sigmoid(pairwise_logits), tf.sigmoid(-pairwise_logits) + ) + + +class PairwiseMSELoss(_PairwiseLoss): + """Implements pairwise MSE loss. + + This loss computes over all pairs, including those with the same labels, but + excluding self pairs in the diagonal of the pairwise matrix. + """ + + def _pairwise_loss(self, pairwise_logits): + # Unused because of overridding `_compute_unreduced_loss_impl`. + pass + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + + # Compute loss. + pairwise_label_diff = _apply_pairwise_op(tf.subtract, labels) + pairwise_logit_diff = _apply_pairwise_op(tf.subtract, logits) + pairwise_mse_loss = tf.math.square(pairwise_logit_diff - pairwise_label_diff) + valid_pair = _apply_pairwise_op(tf.logical_and, mask) + + # Compute weights. + pairwise_weights = tf.ones_like(pairwise_mse_loss) + batch_size, list_size = tf.unstack(tf.shape(input=labels)) + # Excluding the self pairs. 
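+    # tf.eye with batch_shape yields a [batch_size, list_size, list_size]
+    # identity, so subtracting it zeroes the diagonal (i == j) pair weights.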
+ pairwise_weights -= tf.eye(list_size, batch_shape=[batch_size], dtype=pairwise_weights.dtype) + # Including only valid pairs + pairwise_weights *= tf.cast(valid_pair, tf.float32) + if self._lambda_weight is not None: + ranks = _compute_ranks(logits, mask) + pairwise_weights *= self._lambda_weight.pair_weights(labels, ranks) + pairwise_weights = tf.stop_gradient(pairwise_weights, name='weights_stop_gradient') + + return pairwise_mse_loss, pairwise_weights + + +class _ListwiseLoss(_RankingLoss): + """Interface for listwise loss.""" + + def _normalize_weights_impl(self, labels, weights): + """See `_RankingLoss`.""" + if weights is None: + return 1.0 + else: + weights = tf.convert_to_tensor(value=weights) + labels = tf.convert_to_tensor(value=labels) + is_valid = utils.is_label_valid(labels) + labels = tf.where(is_valid, labels, tf.zeros_like(labels)) + return tf.compat.v1.math.divide_no_nan( + tf.reduce_sum(input_tensor=(weights * labels), axis=1, keepdims=True), + tf.reduce_sum(input_tensor=labels, axis=1, keepdims=True) + ) + + def compute_per_list(self, labels, logits, weights, mask=None): + """See `_RankingLoss`.""" + # Prepare input params. + labels, logits, weights, mask = self._prepare_and_validate_params(labels, logits, weights, mask) + + # Listwise losses and weights will be of shape [batch_size, 1]. + losses, loss_weights = self._compute_unreduced_loss_impl(labels, logits, mask) + weights = tf.multiply(self._normalize_weights_impl(labels, weights), loss_weights) + + # This removes the inner dimension of size 1 to make the output shape + # [batch_size]. + per_list_losses = tf.squeeze(losses, axis=1) + per_list_weights = tf.squeeze(weights, axis=1) + return per_list_losses, per_list_weights + + +class CircleLoss(_ListwiseLoss): + """Implements circle loss. + + This is the Circle loss originally proposed by Sun et al. + ["Circle Loss: A Unified Perspective of Pair Similarity Optimization"]. See + https://arxiv.org/abs/2002.10857. + + For a model that outputs similarity scores `s` on data point with + corresponding label y, the circle loss from Eq.(6) in the paper is + L_circle = log(1 + sum_{i is p,j is n} + exp(gamma * (a_j * (s_j - d_n) - a_i * (s_i - d_p)))), + defined for the binary label, p for data points with positive labels and n for + data points with negative labels. + a_i = relu(1 + margin - s_i) + a_j = relu(s_j + margin) + d_p = 1 - margin + d_n = margin + We can extend to non-binary labels with an indiactor function, + L_circle = log(1 + sum_{i, j} I_{y_i > y_j} + exp(gamma * (a_j * (s_j - d_n) - a_i * (s_i - d_p)))), + Note the loss takes only the similarity scores. We will clip any score value + beyond 0 and 1 to confine the scores in [0, 1], please be aware of that. + """ + + def __init__(self, name, lambda_weight=None, gamma=64, margin=0.25, ragged=False): + """Initializer. + + Args: + name: A string used as the name for this loss. + lambda_weight: A `_LambdaWeight` object. + gamma: A float parameter used in circle loss. + margin: A float parameter defining the margin in circle loss. + ragged: A boolean indicating whether the input tensors are ragged. + """ + super().__init__(name, lambda_weight=lambda_weight, temperature=1.0, ragged=ragged) + self._margin = margin + self._gamma = gamma + + def get_logits(self, logits): + """See `_RankingLoss`.""" + # Add a clip to confine scores in [0, 1]. + return tf.clip_by_value(tf.convert_to_tensor(value=logits), 0., 1.) 
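+
+  # Worked numbers for the defaults above (illustrative): with margin = 0.25,
+  # d_p = 1 - margin = 0.75 and d_n = margin = 0.25, so a positive item with
+  # score s_i gets weight alpha_i = relu(1.25 - s_i) and a negative item with
+  # score s_j gets weight alpha_j = relu(s_j + 0.25), which are exactly the
+  # quantities formed in circle_loss_pairwise_op below.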
+ + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + + def circle_loss_pairwise_op(score_i, score_j): + alpha_i = tf.stop_gradient(tf.nn.relu(1 - score_i + self._margin), name='circle_loss_alpha_pos') + alpha_j = tf.stop_gradient(tf.nn.relu(score_j + self._margin), name='circle_loss_alpha_neg') + return alpha_i * (1 - score_i - self._margin) + alpha_j * (score_j - self._margin) + + pairwise_labels, pairwise_logits = _pairwise_comparison( + labels, logits, mask, pairwise_logits_op=circle_loss_pairwise_op + ) + pairwise_weights = tf.stop_gradient(pairwise_labels, name='weights_stop_gradient') + # TODO: try lambda_weights for circle loss. + # Pairwise losses and weights will be of shape + # [batch_size, list_size, list_size]. + losses = tf.exp(self._gamma * pairwise_logits) + + # This computes the per-list losses and weights for circle loss. + per_list_losses = tf.math.log1p(tf.reduce_sum(tf.math.multiply(losses, pairwise_weights), axis=[1, 2])) + per_list_weights = tf.reduce_sum(pairwise_weights, axis=[ + 1, 2 + ]) / tf.reduce_sum(tf.cast(pairwise_weights > 0, tf.float32), axis=[1, 2]) + + # Return per-list losses and weights with shape [batch_size, 1]. + return tf.expand_dims(per_list_losses, 1), tf.expand_dims(per_list_weights, 1) + + +class SoftmaxLoss(_ListwiseLoss): + """Implements softmax loss.""" + + def precompute(self, labels, logits, weights, mask=None): + """Precomputes Tensors for softmax cross entropy inputs.""" + if mask is None: + mask = utils.is_label_valid(labels) + ranks = _compute_ranks(logits, mask) + # Reset the masked labels to 0 and reset the masked logits to a logit with + # ~= 0 contribution in softmax. + labels = tf.compat.v1.where(mask, labels, tf.zeros_like(labels)) + logits = tf.compat.v1.where(mask, logits, tf.math.log(_EPSILON) * tf.ones_like(logits)) + if self._lambda_weight is not None and isinstance(self._lambda_weight, DCGLambdaWeight): + labels = self._lambda_weight.individual_weights(labels, ranks) + if weights is not None: + labels *= weights + return labels, logits + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + label_sum = tf.reduce_sum(input_tensor=labels, axis=1, keepdims=True) + # Padding for rows with label_sum = 0. + nonzero_mask = tf.greater(tf.reshape(label_sum, [-1]), 0.0) + padded_labels = tf.compat.v1.where(nonzero_mask, labels, _EPSILON * tf.ones_like(labels)) + padded_labels = tf.compat.v1.where(mask, padded_labels, tf.zeros_like(padded_labels)) + padded_label_sum = tf.reduce_sum(input_tensor=padded_labels, axis=1, keepdims=True) + labels_for_softmax = tf.math.divide_no_nan(padded_labels, padded_label_sum) + logits_for_softmax = logits + # Padded labels have 0 weights in label_sum. 
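+    # In effect this is the ListNet-style softmax cross entropy: each list
+    # contributes -sum_i (l_i / sum_j l_j) * log softmax(s)_i and is weighted
+    # by sum_j l_j, so all-zero (or fully padded) lists drop out.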
+ weights_for_softmax = tf.reshape(label_sum, [-1]) + losses = tf.compat.v1.nn.softmax_cross_entropy_with_logits_v2(labels_for_softmax, logits_for_softmax) + return losses, weights_for_softmax + + def compute(self, labels, logits, weights, reduction, mask=None): + """See `_RankingLoss`.""" + labels, logits, weights, mask = self._prepare_and_validate_params(labels, logits, weights, mask) + logits = self.get_logits(logits) + labels, logits = self.precompute(labels, logits, weights, mask) + losses, weights = self._compute_unreduced_loss_impl(labels, logits, mask) + return tf.compat.v1.losses.compute_weighted_loss(losses, weights, reduction=reduction) + + def eval_metric(self, labels, logits, weights, mask=None): + """See `_RankingLoss`.""" + labels, logits, weights, mask = self._prepare_and_validate_params(labels, logits, weights, mask) + logits = self.get_logits(logits) + labels, logits = self.precompute(labels, logits, weights, mask) + losses, weights = self._compute_unreduced_loss_impl(labels, logits, mask) + return tf.compat.v1.metrics.mean(losses, weights) + + def compute_per_list(self, labels, logits, weights, mask=None): + """See `_RankingLoss`.""" + # Prepare input params. + labels, logits, weights, mask = self._prepare_and_validate_params(labels, logits, weights, mask) + + # As opposed to the other listwise losses, SoftmaxLoss returns already + # squeezed losses, which can be returned directly. + logits = self.get_logits(logits) + labels, logits = self.precompute(labels, logits, weights, mask) + return self._compute_unreduced_loss_impl(labels, logits, mask) + + def compute_unreduced_loss(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + labels, logits, _, mask = self._prepare_and_validate_params(labels, logits, None, mask) + logits = self.get_logits(logits) + labels, logits = self.precompute(labels, logits, weights=None, mask=mask) + return self._compute_unreduced_loss_impl(labels, logits, mask) + + +class PolyOneSoftmaxLoss(SoftmaxLoss): + """Implements poly1 softmax loss.""" + + def __init__(self, name, lambda_weight=None, epsilon=1.0, temperature=1.0, ragged=False): + """Constructor. + + Args: + name: A string used as the name for this loss. + lambda_weight: A `_LambdaWeight` object. + epsilon: A float number for contribution of the first polynomial. + temperature: A float number to modify the logits=logits/temperature. + ragged: A boolean indicating whether the input tensors are ragged. + """ + super().__init__(name, lambda_weight=lambda_weight, temperature=temperature, ragged=ragged) + self._epsilon = epsilon + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + label_sum = tf.reduce_sum(input_tensor=labels, axis=1, keepdims=True) + # Padding for rows with label_sum = 0. + nonzero_mask = tf.greater(tf.reshape(label_sum, [-1]), 0.0) + padded_labels = tf.compat.v1.where(nonzero_mask, labels, _EPSILON * tf.ones_like(labels)) + padded_labels = tf.compat.v1.where(mask, padded_labels, tf.zeros_like(padded_labels)) + padded_label_sum = tf.reduce_sum(input_tensor=padded_labels, axis=1, keepdims=True) + labels_for_softmax = tf.math.divide_no_nan(padded_labels, padded_label_sum) + logits_for_softmax = logits + # Padded labels have 0 weights in label_sum. 
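+    # Poly-1 term (following the PolyLoss formulation): pt below is the
+    # probability mass the softmax assigns to the normalized target labels,
+    # and the loss adds epsilon * (1 - pt) on top of the softmax cross
+    # entropy.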
+ weights_for_softmax = tf.reshape(label_sum, [-1]) + pt = tf.reduce_sum(labels_for_softmax * tf.nn.softmax(logits_for_softmax), axis=-1) + ce = tf.compat.v1.nn.softmax_cross_entropy_with_logits_v2(labels_for_softmax, logits_for_softmax) + losses = ce + self._epsilon * (1 - pt) + return losses, weights_for_softmax + + +class UniqueSoftmaxLoss(_ListwiseLoss): + """Implements unique rating softmax loss.""" + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + labels = tf.compat.v1.where(mask, labels, tf.zeros_like(labels)) + logits = tf.compat.v1.where(mask, logits, tf.math.log(_EPSILON) * tf.ones_like(logits)) + pairwise_labels, _ = _pairwise_comparison(labels, logits, mask) + # Used in denominator to compute unique softmax probability for each doc. + denominator_logits = tf.expand_dims(logits, axis=1) * pairwise_labels + denominator_logits = tf.concat([denominator_logits, tf.expand_dims(logits, axis=2)], axis=2) + denominator_mask = tf.concat([pairwise_labels, tf.expand_dims(tf.ones_like(logits), axis=2)], axis=2) + denominator_logits = tf.where( + tf.greater(denominator_mask, 0.0), denominator_logits, + -1e-3 + tf.reduce_min(denominator_logits) * tf.ones_like(denominator_logits) + ) + logits_max = tf.reduce_max(denominator_logits, axis=-1, keepdims=True) + # Subtract the max so that exp(denominator_logits) is numerically valid. + denominator_logits -= logits_max + logits -= tf.squeeze(logits_max, axis=-1) + # Set gains for loss weights. + gains = tf.pow(2.0, labels) - 1 + # Compute the softmax loss for each doc. + per_doc_softmax = -logits + tf.math.log(tf.reduce_sum(tf.exp(denominator_logits) * denominator_mask, axis=-1)) + losses = tf.reduce_sum(per_doc_softmax * gains, axis=1, keepdims=True) + return losses, tf.ones_like(losses) + + +class _PointwiseLoss(_RankingLoss): + """Interface for pointwise loss.""" + + def _normalize_weights_impl(self, labels, weights): + """See _RankingLoss.""" + if weights is None: + weights = 1. + return tf.compat.v1.where(utils.is_label_valid(labels), tf.ones_like(labels) * weights, tf.zeros_like(labels)) + + def compute_per_list(self, labels, logits, weights, mask=None): + """See `_RankingLoss`.""" + # Prepare input params. + labels, logits, weights, mask = self._prepare_and_validate_params(labels, logits, weights, mask) + + # Pointwise losses and weights will be of shape [batch_size, list_size]. + losses, loss_weights = self._compute_unreduced_loss_impl(labels, logits, mask) + weights = tf.multiply(self._normalize_weights_impl(labels, weights), loss_weights) + + # Compute the weighted per-item loss. + weighted_per_item_loss = tf.math.multiply(losses, weights) + + # Sum the inner dimensions to obtain per-list weights. For pointwise losses + # this typically indicates the (weighted) number of items per list. + per_list_weights = tf.reduce_sum(weights, axis=1) + + # This computes the per-list losses by summing all weighted per-item losses. + per_list_losses = tf.reduce_sum(weighted_per_item_loss, axis=1) + + # Normalize the per-list losses so that lists with different numbers of + # items have comparable losses. The different numbers of items is reflected + # in the per-list weights. + per_list_losses = tf.math.divide_no_nan(per_list_losses, per_list_weights) + return per_list_losses, per_list_weights + + +class ClickEMLoss(_PointwiseLoss): + """Implements the click EM loss with examination and relevance. 
+ + The implementation is based on the the paper by Wang et al: "Position bias + estimation for unbiased learning to rank in personal search." It assumes that + a click is generated by a factorized model P(examination) * P(relevance), + which are latent variables determined by `exam_logits` and `rel_logits` + respectively. An EM algorithm is used for estimation and this function + implements the expectation step to estimate the P(latent | observed), i.e., + P(examination | click) and P(relevance | click). + """ + + def __init__(self, name, temperature=1.0, exam_loss_weight=1.0, rel_loss_weight=1.0, ragged=False): + super().__init__(name, None, temperature, ragged) + self._exam_loss_weight = exam_loss_weight + self._rel_loss_weight = rel_loss_weight + + def _compute_latent_prob(self, clicks, exam_logits, rel_logits): + """Computes the probability of latent variables in EM. + + The original compuation is as follows and can be unstable: + exam_prob = sigmoid(exam_logits) + rel_prob = sigmoid(rel_logits) + exam_prob_posterior = exam_prob * (1 - rel_prob) / (1 - exam_prob * + rel_prob) + rel_prob_posterior = rel_prob * (1 - exam_prob) / (1 - exam_prob * + rel_prob). + + To increase the numeric stability, we compute the posteriror logits first. + Using the exam_logits_posterior as an example, we have: + exam_logit_posterior = logit(exam_prob_posterior) + = log(exam_prob_posterior / (1 - exam_prob_posterior)) + It can be reduced to exam_logits and rel_logits: + exam_logit_posterior = exam_logits - log(1 + exp(rel_logits)) + = exam_logits - softplus(rel_logits) + + We can do similar reduction for rel_logit_posterior. Then we compute the + posterior probablity by apply sigmoid on the logits. + + Args: + clicks: A 2-D `Tensor` for clicks as observed data. A value >= 1.0 is + treated as clicked. + exam_logits: A 2-D `Tensor` to compute P(examination) and has the same + shape as `clicks`. + rel_logits: A 2-D `Tensor` to compute P(relevance) and has the same shape + as `clicks`. + + Returns: + A tuple of (exam_given_clicks, rel_given_clicks) representing + P(examination | click) and P(relevance | click). + """ + with tf.compat.v1.name_scope(name='compute_latent_prob'): + is_clicked = tf.greater_equal(tf.cast(clicks, tf.float32), 1.0) + exam_logits_posterior = exam_logits - tf.math.softplus(rel_logits) + rel_logits_posterior = rel_logits - tf.math.softplus(exam_logits) + exam_prob_posterior = tf.compat.v1.where( + is_clicked, tf.ones_like(exam_logits_posterior), tf.sigmoid(exam_logits_posterior) + ) + rel_prob_posterior = tf.compat.v1.where( + is_clicked, tf.ones_like(rel_logits_posterior), tf.sigmoid(rel_logits_posterior) + ) + return tf.stop_gradient(exam_prob_posterior), tf.stop_gradient(rel_prob_posterior) + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """Computes the loss for each element. + + Args: + labels: A `Tensor` with shape [batch_size, list_size] representing clicks. + logits: A `Tensor` with shape [batch_size, list_size, 2], where the first + value in the 3rd-dim is the logits for examination and the second value + is the logits for relevance. + mask: A `Tensor` of the same shape as labels indicating which entries are + valid for computing the loss. + + Returns: + A tuple(losses, loss_weights). 
+ """ + if mask is None: + mask = utils.is_label_valid(labels) + labels = tf.compat.v1.where(mask, labels, tf.zeros_like(labels)) + exam_logits, rel_logits = tf.unstack(logits, axis=2) + exam_logits = tf.compat.v1.where(mask, exam_logits, tf.zeros_like(exam_logits)) + rel_logits = tf.compat.v1.where(mask, rel_logits, tf.zeros_like(rel_logits)) + # The distribution in the E step. + exam_latent_prob, rel_latent_prob = self._compute_latent_prob(labels, exam_logits, rel_logits) + # The loss in the M step. + losses = tf.compat.v1.nn.sigmoid_cross_entropy_with_logits( + labels=exam_latent_prob, logits=exam_logits + ) * self._exam_loss_weight + losses += tf.compat.v1.nn.sigmoid_cross_entropy_with_logits( + labels=rel_latent_prob, logits=rel_logits + ) * self._rel_loss_weight + return losses, tf.cast(mask, dtype=tf.float32) + + +class SigmoidCrossEntropyLoss(_PointwiseLoss): + """Implements sigmoid cross entropy loss.""" + + def __init__(self, name, temperature=1.0, ragged=False): + """Overwrite the constructor. + + Args: + name: A string used as the name for this loss. + temperature: A float number to modify the logits=logits/temperature. + ragged: A boolean indicating whether the input tensors are ragged. + """ + super().__init__(name, None, temperature, ragged) + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + labels = tf.compat.v1.where(mask, labels, tf.zeros_like(labels)) + logits = tf.compat.v1.where(mask, logits, tf.zeros_like(logits)) + losses = tf.compat.v1.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits) + return losses, tf.cast(mask, dtype=tf.float32) + + +class MeanSquaredLoss(_PointwiseLoss): + """Implements the means squared error loss.""" + + def __init__(self, name, ragged=False): + """Overwrite the constructor. + + Args: + name: A string used as the name for this loss. + ragged: A boolean indicating whether the input tensors are ragged. + """ + # temperature is not used in this loss. + super().__init__(name, None, temperature=1.0, ragged=ragged) + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + labels = tf.compat.v1.where(mask, labels, tf.zeros_like(labels)) + logits = tf.compat.v1.where(mask, logits, tf.zeros_like(logits)) + losses = tf.compat.v1.squared_difference(labels, logits) + return losses, tf.cast(mask, dtype=tf.float32) + + +class MixtureEMLoss(_ListwiseLoss): + """Implements the Mixture EM loss with examination and relevance. + + An Expecatation-Maximization (EM) algorithm is used for estimation and this + function. + """ + + def __init__(self, name, temperature=1.0, alpha=1.0, ragged=False): + super().__init__(name, None, temperature, ragged) + self._alpha = alpha + + def _compute_model_prob(self, per_list_logodds): + """Computes the probability of models in EM. + + Args: + per_list_logodds: A `Tensor` with shape [batch_size, 1, model_num]. + + Returns: + A `Tensor` of probability with shape [batch_size, 1, model_num]. + """ + with tf.compat.v1.name_scope(name='compute_model_prob'): + return tf.stop_gradient( + tf.exp(-self._alpha * (per_list_logodds - tf.reduce_min(per_list_logodds, axis=2, keepdims=True))) + ) + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """Computes the loss for each element. + + Args: + labels: A `Tensor` with shape [batch_size, list_size] representing clicks. 
+ logits: A `Tensor` with shape [batch_size, list_size, model_num], where + the 3rd-dim is dimension for the models to mix. + mask: A `Tensor` of the same shape as labels indicating which entries are + valid for computing the loss. + + Returns: + A tuple(losses, loss_weights). + """ + if mask is None: + mask = utils.is_label_valid(labels) + labels = tf.compat.v1.where(mask, labels, tf.zeros_like(labels)) + # The loss in the M step. + # shape = [batch_size, list_size, model_num] + losses = tf.stack( + [ + tf.compat.v1.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=model_logits) + for model_logits in tf.unstack(logits, axis=-1) + ], + axis=2 + ) + losses = tf.where(tf.expand_dims(mask, axis=-1), losses, tf.zeros_like(losses, dtype=tf.float32)) + + # The model probability in the E step. + losses_no_gradient = tf.stop_gradient(losses) + # shape = [batch_size, 1, model_num] + per_list_logodds = tf.reduce_sum(losses_no_gradient, axis=1, keepdims=True) + model_prob = self._compute_model_prob(per_list_logodds) + prob_norm = tf.reduce_sum(model_prob, axis=2, keepdims=True) + + label_sum = tf.reduce_sum(input_tensor=labels, axis=1, keepdims=True) + nonzero_mask = tf.greater(label_sum, 0.0) + return tf.reshape(tf.reduce_sum(losses * model_prob / prob_norm, axis=[1, 2]), + [-1, 1]), tf.cast(nonzero_mask, dtype=tf.float32) + + +class ListMLELoss(_ListwiseLoss): + """Implements ListMLE loss.""" + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + # Reset the masked labels to 0 and reset the masked logits to a logit with + # ~= 0 contribution. + labels = tf.compat.v1.where(mask, labels, tf.zeros_like(labels)) + logits = tf.compat.v1.where(mask, logits, tf.math.log(_EPSILON) * tf.ones_like(logits)) + scores = tf.compat.v1.where( + mask, labels, + tf.reduce_min(input_tensor=labels, axis=1, keepdims=True) - 1e-6 * tf.ones_like(labels) + ) + # Use a fixed ops-level seed and the randomness is controlled by the + # graph-level seed. + sorted_labels, sorted_logits = utils.sort_by_scores(scores, [labels, logits], shuffle_ties=True, seed=37) + + raw_max = tf.reduce_max(input_tensor=sorted_logits, axis=1, keepdims=True) + sorted_logits = sorted_logits - raw_max + sums = tf.cumsum(tf.exp(sorted_logits), axis=1, reverse=True) + sums = tf.math.log(sums) - sorted_logits + + if self._lambda_weight is not None and isinstance(self._lambda_weight, ListMLELambdaWeight): + batch_size, list_size = tf.unstack(tf.shape(input=sorted_labels)) + sums *= self._lambda_weight.individual_weights( + sorted_labels, tf.tile(tf.expand_dims(tf.range(list_size) + 1, 0), [batch_size, 1]) + ) + + negative_log_likelihood = tf.reduce_sum(input_tensor=sums, axis=1, keepdims=True) + return negative_log_likelihood, tf.ones_like(negative_log_likelihood) + + +class ApproxNDCGLoss(_ListwiseLoss): + """Implements ApproxNDCG loss.""" + + # Use a different default temperature. 
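+  # A smaller temperature makes sigmoid((s_j - s_i) / temperature) inside
+  # approx_ranks closer to a hard indicator, i.e. the approximate ranks
+  # closer to the true integer ranks, at the cost of a less smooth loss
+  # surface; hence the 0.1 default below rather than 1.0.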
+ def __init__(self, name, lambda_weight=None, temperature=0.1, ragged=False): + """See `_ListwiseLoss`.""" + super().__init__(name, lambda_weight, temperature, ragged) + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + labels = tf.compat.v1.where(mask, labels, tf.zeros_like(labels)) + logits = tf.compat.v1.where( + mask, logits, -1e3 * tf.ones_like(logits) + tf.reduce_min(input_tensor=logits, axis=-1, keepdims=True) + ) + + label_sum = tf.reduce_sum(input_tensor=labels, axis=1, keepdims=True) + nonzero_mask = tf.greater(tf.reshape(label_sum, [-1]), 0.0) + labels = tf.compat.v1.where(nonzero_mask, labels, _EPSILON * tf.ones_like(labels)) + ranks = approx_ranks(logits) + + return -ndcg(labels, ranks), tf.reshape(tf.cast(nonzero_mask, dtype=tf.float32), [-1, 1]) + + +class ApproxMRRLoss(_ListwiseLoss): + """Implements ApproxMRR loss.""" + + # Use a different default temperature. + def __init__(self, name, lambda_weight=None, temperature=0.1, ragged=False): + """See `_ListwiseLoss`.""" + super().__init__(name, lambda_weight, temperature, ragged) + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + labels = tf.compat.v1.where(mask, labels, tf.zeros_like(labels)) + logits = tf.compat.v1.where( + mask, logits, -1e3 * tf.ones_like(logits) + tf.math.reduce_min(input_tensor=logits, axis=-1, keepdims=True) + ) + + label_sum = tf.math.reduce_sum(input_tensor=labels, axis=1, keepdims=True) + + nonzero_mask = tf.math.greater(tf.reshape(label_sum, [-1]), 0.0) + labels = tf.compat.v1.where(nonzero_mask, labels, _EPSILON * tf.ones_like(labels)) + + rr = 1. / approx_ranks(logits) + rr = tf.math.reduce_sum(input_tensor=rr * labels, axis=-1, keepdims=True) + mrr = rr / tf.math.reduce_sum(input_tensor=labels, axis=-1, keepdims=True) + return -mrr, tf.reshape(tf.cast(nonzero_mask, dtype=tf.float32), [-1, 1]) + + +class NeuralSortCrossEntropyLoss(_ListwiseLoss): + """Implements Cross-entropy loss of neural sort permutation matrix.""" + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + labels = tf.compat.v1.where(mask, labels, tf.zeros_like(labels)) + logits = tf.compat.v1.where(mask, logits, tf.zeros_like(logits)) + + label_sum = tf.reduce_sum(input_tensor=labels, axis=1, keepdims=True) + nonzero_mask = tf.greater(tf.reshape(label_sum, [-1]), 0.0) + + # shape = [batch_size, list_size, list_size]. + true_perm = neural_sort(labels, mask=mask) + smooth_perm = neural_sort(logits, mask=mask) + + losses = tf.compat.v1.nn.softmax_cross_entropy_with_logits_v2( + labels=true_perm, logits=tf.math.log(1e-20 + smooth_perm), axis=2 + ) + + # Neural sort will place masked entries last. Losses are still computed on + # those entries so we need to cancel those out. This means we need to mask + # out the last n entries, where n is the number of masked items per list. We + # do so by sorting the mask and setting (masked) invalid losses to 0. + sorted_mask = tf.cast(tf.sort(tf.cast(mask, dtype=tf.float32), axis=1, direction='DESCENDING'), dtype=tf.bool) + losses = tf.where(sorted_mask, losses, tf.zeros_like(losses)) + + # shape = [batch_size, list_size]. 
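+    # Normalize the summed cross-entropy by the number of valid entries in
+    # each list; divide_no_nan yields 0 rather than NaN for fully masked lists.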
+ losses = tf.math.divide_no_nan( + tf.reduce_sum(input_tensor=losses, axis=-1, keepdims=True), + tf.reduce_sum(input_tensor=tf.cast(mask, dtype=tf.float32), axis=-1, keepdims=True) + ) + + return losses, tf.reshape(tf.cast(nonzero_mask, dtype=tf.float32), [-1, 1]) + + +class NeuralSortNDCGLoss(_ListwiseLoss): + """Implements PiRank-NDCG loss. + + The PiRank-NDCG loss is a differentiable approximation of the NDCG metric + using the NeuralSort trick, which generates a permutation matrix based on + ranking scores. Please refer to https://arxiv.org/abs/2012.06731 for the + PiRank method. For PiRank-NDCG in specific, + NDCG_metric = - sum_i (2^y_i - 1) / log(1 + r_i) / maxDCG, + where y_i and r_i are the label and the score rank of the ith document + respectively. This metric can be also written as the sum over rank r with an + indicator function I, + NDCG_metric = - sum_{i,r} (2^y_i - 1) / log(1 + r) * I(r, r_i) / maxDCG, + where the indicator function I(r, r_i) = 1 if r = r_i and 0 otherwise, which + is the permutation matrix. + + Approximated with a differentiable permutation matrix using neural sort, + PiRank-NDCG = - sum_{i,r} (2^y_i - 1) / log(1 + r) * P(r, i) / maxDCG, + where P(r, i) is the approximation of the permutation matrix. + """ + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + labels = tf.compat.v1.where(mask, labels, tf.zeros_like(labels)) + logits = tf.compat.v1.where(mask, logits, tf.zeros_like(logits)) + + label_sum = tf.reduce_sum(input_tensor=labels, axis=1, keepdims=True) + nonzero_mask = tf.greater(tf.reshape(label_sum, [-1]), 0.0) + # shape = [batch_size, list_size]. + labels = tf.compat.v1.where(nonzero_mask, labels, _EPSILON * tf.ones_like(labels)) + # shape = [batch_size, list_size, list_size]. + smooth_perm = neural_sort(logits, mask=mask) + + return -ndcg(labels, perm_mat=smooth_perm), tf.reshape(tf.cast(nonzero_mask, dtype=tf.float32), [-1, 1]) + + +def neural_sort(logits, name=None, mask=None): + r"""Generate the permutation matrix from logits by deterministic neuralsort. + + The sort on a list of logits can be approximated by a differentiable + permutation matrix using Neural Sort (https://arxiv.org/abs/1903.08850). + The approximation is achieved by constructing a list of functions on logits, + fn_i(k) = (list_size + 1 - 2*i) * logit_k - sum_j |logit_k - logit_j|, + whose value is maximal when k is at the ith largest logit. + So that the permutation matrix can be expressed as + / 1 if j = argmax_k fn_i(k) + P_ij = | = one_hot(argmax(fn_i(j))). + \ 0 otherwise + And the differentiable approximation of the matrix is applied with softmax, + P^_ij = softmax(fn_i(j) / temperature), + where the parameter temperature tunes the smoothiness of the approximation. + + #### References + [1]: Aditya Grover, Eric Wang, Aaron Zweig, Stefano Ermon. + Stochastic Optimization of Sorting Networks via Continuous Relaxations. + https://arxiv.org/abs/1903.08850 + + Args: + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. (We are using logits here, + noticing the original paper is using probability weights, i.e., the + exponentials of the logits). + name: A string used as the name for this loss. + mask: A `Tensor` with the same shape as logits indicating which entries are + valid for computing the neural_sort. Invalid entries are pushed to the + end. 
+ + Returns: + A tensor of permutation matrices whose dimension is [batch_size, list_size, + list_size]. + """ + with tf.compat.v1.name_scope(name, 'neural_sort', [logits]): + if mask is None: + mask = tf.ones_like(logits, dtype=tf.bool) + + # Reset logits to 0 and compute number of valid entries for each list in the + # batch. + logits = tf.where(mask, logits, tf.zeros_like(logits)) + num_valid_entries = tf.reduce_sum(tf.cast(mask, dtype=tf.int32), axis=1, keepdims=True) + + # Compute logit differences and mask out invalid entries. + logit_diff = tf.abs(tf.expand_dims(logits, 2) - tf.expand_dims(logits, 1)) + valid_pair_mask = _apply_pairwise_op(tf.logical_and, mask) + logit_diff = tf.where(valid_pair_mask, logit_diff, tf.zeros_like(logit_diff)) + # shape = [batch_size, 1, list_size]. + logit_diff_sum = tf.reduce_sum(input_tensor=logit_diff, axis=1, keepdims=True) + + # Compute masked range so that masked items do not influence scaling. + masked_range = tf.cumsum(tf.cast(mask, dtype=tf.int32), axis=1) + scaling = tf.cast(num_valid_entries + 1 - 2 * masked_range, dtype=tf.float32) + # shape = [batch_size, list_size]. + scaling = tf.expand_dims(scaling, 2) + # shape = [batch_size, list_size, list_size]. + # Use broadcast to align the dims. + scaled_logits = scaling * tf.expand_dims(logits, 1) + + p_logits = scaled_logits - logit_diff_sum + + # Masked entries will be forcefully kept in-place by setting their values to + # -inf everywhere, except for masked rows where they share equal probability + # with other masked items. + p_logits = tf.where(valid_pair_mask, p_logits, -math.inf) + p_logits = tf.where(_apply_pairwise_op(tf.logical_or, mask), p_logits, tf.zeros_like(p_logits)) + + # By swapping the rows of masked items to the end of the permutation matrix, + # we force masked items to be placed last. + sorted_mask_indices = tf.argsort(tf.cast(mask, dtype=tf.int32), axis=1, direction='DESCENDING', stable=True) + p_logits = tf.gather(p_logits, sorted_mask_indices, batch_dims=1, axis=1) + + smooth_perm = tf.nn.softmax(p_logits, -1) + + return smooth_perm + + +def gumbel_neural_sort(logits, name=None, sample_size=8, temperature=1.0, seed=None): + """Generate the permutation matrix from logits by stochastic neuralsort. + + By sampling logits from the Gumbel distribution, + sampled_logits = logits + Gumbel(0, 1), + the determinstic neural sort z of sampled_logits obeys the distribution with + Prob(z|logits) = (exp(logit_z1) / Z) * (exp(logit_z2) / Z-exp(logit_z1)) * + ... * (exp(logit_zn) / Z-sum_i^(n-1)exp(logit_zi)), + where Z = sum_i exp(logit_i). + + Args: + logits: A `Tensor` with shape [batch_size, list_size]. Each value is the + ranking score of the corresponding item. + name: A string used as the name for this loss. + sample_size: An integer representing the number of samples drawn from the + Concrete distribution defined by scores. + temperature: The Gumbel-Softmax temperature. + seed: Seed for pseudo-random number generator. + + Returns: + A `Tensor` of permutation matrices whose dimension is [batch_size, + sample_size, list_size, list_size]. + """ + with tf.compat.v1.name_scope(name, 'gumbel_neural_sort', [logits]): + batch_size = tf.shape(input=logits)[0] + list_size = tf.shape(input=logits)[1] + + # Sample logits from Concrete(logits). 
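+    # Each of the `sample_size` samples perturbs the logits with independent
+    # Gumbel(0, 1) noise; dividing by `temperature` below controls how close
+    # the relaxed permutation matrices are to hard permutations.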
+    sampled_logits = tf.expand_dims(logits, 1)
+    sampled_logits += _sample_gumbel([batch_size, sample_size, list_size], seed=seed)
+    sampled_logits = tf.reshape(sampled_logits, [batch_size * sample_size, list_size])
+
+    # Sort by constructing the relaxed permutation matrix from sampled logits.
+    smooth_perm = neural_sort(sampled_logits / temperature, name)
+    smooth_perm = tf.reshape(smooth_perm, [batch_size, sample_size, list_size, list_size])
+
+    return smooth_perm
+
+
+class OrdinalLoss(_PointwiseLoss):
+  """Implements ordinal loss."""
+
+  def __init__(self, name, ordinal_size, temperature=1.0, ragged=False, use_fraction_label=False):
+    """Initializer.
+
+    Args:
+      name: A string used as the name for this loss.
+      ordinal_size: An integer number of ordinal levels of labels.
+      temperature: A float number to modify the logits=logits/temperature.
+      ragged: A boolean indicating whether the input tensors are ragged.
+      use_fraction_label: A boolean indicating whether to leverage the fraction
+        part when the input labels contain fractions.
+    """
+    super().__init__(name, None, temperature, ragged)
+    self._ordinal_size = ordinal_size
+    self._use_fraction_label = use_fraction_label
+
+  def _labels_to_ordinals(self, labels, mask):
+    """Helper function to transform input labels to ordinal values.
+
+    When use_fraction_label is false, ordinals will be 1.0 if labels >= i for
+    the ordinal head i, with i = 1, ..., ordinal_size.
+    When use_fraction_label is true, the fraction part of labels will be counted
+    if labels > i-1 but < i.
+
+    For a fraction label 1.2, and ordinal_size=2
+      when use_fraction_label is false, it maps to an ordinal like [1.0, 0.0],
+      when use_fraction_label is true, it maps to an ordinal like [1.0, 0.2].
+
+    Args:
+      labels: A Tensor of shape [batch_size, list_size].
+      mask: A Tensor of shape [batch_size, list_size].
+
+    Returns:
+      ordinals, shape [batch_size, list_size, ordinal_size]
+    """
+    one_to_n = tf.range(1, self._ordinal_size + 1, dtype=tf.float32)
+    unsqueezed = tf.repeat(tf.expand_dims(labels, axis=2), self._ordinal_size, axis=-1)
+    ordinals = tf.where(unsqueezed >= one_to_n, tf.ones_like(unsqueezed), 0.0)
+    if self._use_fraction_label:
+      fractions = unsqueezed - one_to_n + 1.0
+      fractions = tf.where(tf.logical_and(fractions > 0.0, fractions < 1.0), fractions, 0.0)
+      ordinals += fractions
+    return tf.where(tf.expand_dims(mask, axis=-1), ordinals, 0.0)
+
+  def _compute_unreduced_loss_impl(self, labels, logits, mask=None):
+    """See `_RankingLoss`."""
+    if mask is None:
+      mask = utils.is_label_valid(labels)
+    if logits.shape.rank != 3:
+      raise ValueError('Predictions for ordinal loss must have rank 3.')
+    elif logits.shape[-1] != self._ordinal_size:
+      raise ValueError(
+          'The last dimension of logits must be the number of ordinal levels '
+          f'{self._ordinal_size}, the actual dimension is {logits.shape[-1]}.'
+      )
+    labels = tf.where(mask, labels, 0.0)
+    logits = tf.where(tf.expand_dims(mask, -1), logits, 0.0)
+    ordinals = self._labels_to_ordinals(labels, mask)
+    losses = tf.where(
+        tf.expand_dims(mask, -1), tf.compat.v1.nn.sigmoid_cross_entropy_with_logits(labels=ordinals, logits=logits), 0.0
+    )
+    return tf.reduce_sum(losses, axis=-1), tf.cast(mask, dtype=tf.float32)
+
+
+class MultiClassLoss(_PointwiseLoss):
+  """Implements multi-class loss."""
+
+  def __init__(self, name, num_classes, temperature=1.0, ragged=False, from_logits=False, label_smoothing=0.0):
+    """Initializer.
+
+    Args:
+      name: A string used as the name for this loss.
+      num_classes: An integer number of classes. To use this loss,
+        num_classes must be greater than 1.
+      temperature: A float number to modify the logits=logits/temperature.
+      ragged: A boolean indicating whether the input tensors are ragged.
+      from_logits: A boolean indicating whether the input is logits or probs.
+      label_smoothing: A float number for label smoothing.
+    """
+    super().__init__(name, None, temperature, ragged)
+    self._num_classes = num_classes
+    self._from_logits = from_logits
+    self._label_smoothing = label_smoothing
+
+  def _labels_to_one_hot_class(self, labels, mask):
+    """Helper function to transform input labels to one hot class labels.
+
+    Args:
+      labels: A Tensor of shape [batch_size, list_size].
+      mask: A Tensor of shape [batch_size, list_size].
+
+    Returns:
+      one-hot class label, shape [batch_size, list_size, num_classes]
+    """
+    classes = tf.one_hot(tf.cast(labels, tf.int32), self._num_classes, dtype=tf.float32)
+    return tf.where(tf.expand_dims(mask, axis=-1), classes, 0.0)
+
+  def _compute_unreduced_loss_impl(self, labels, logits, mask=None):
+    """See `_RankingLoss`."""
+    if mask is None:
+      mask = utils.is_label_valid(labels)
+    if logits.shape.rank != 3:
+      raise ValueError('Predictions for multi-class loss must have rank 3.')
+    elif logits.shape[-1] != self._num_classes:
+      raise ValueError(
+          'The last dimension of logits must be the number of classes '
+          f'{self._num_classes}, the actual dimension is {logits.shape[-1]}.'
+      )
+    labels = tf.where(mask, labels, 0.0)
+    logits = tf.where(tf.expand_dims(mask, -1), logits, 0.0)
+    classes = self._labels_to_one_hot_class(labels, mask)
+    losses = tf.keras.losses.CategoricalCrossentropy(
+        from_logits=self._from_logits,
+        label_smoothing=self._label_smoothing,
+        axis=-1,
+        reduction=tf.keras.losses.Reduction.NONE,
+        name='categorical_crossentropy'
+    )(classes, logits, tf.cast(mask, dtype=tf.float32))
+    return losses, tf.cast(mask, dtype=tf.float32)
+
+
+class CoupledRankDistilLoss(_ListwiseLoss):
+  r"""Implements Coupled-RankDistil loss.
+
+  The Coupled-RankDistil loss ([Reddi et al, 2021][reddi2021]) is the
+  cross-entropy between k-Plackett's probability of logits (student) and labels
+  (teacher).
+
+  The k-Plackett's probability model is defined as:
+  $$
+  \mathcal{P}_k(\pi|s) = \frac{1}{(N-k)!}
+  \prod_{i=1}^k \frac{\exp(s_{\pi(i)})}{\sum_{j=i}^N \exp(s_{\pi(j)})}.
+  $$
+
+  The Coupled-RankDistil loss is defined as:
+  $$
+  \mathcal{L}(y, s) = -\sum_{\pi} \mathcal{P}_k(\pi|y) \log \mathcal{P}(\pi|s) \\
+  = \mathcal{E}_{\pi \sim \mathcal{P}_k(\cdot|y)} [-\log \mathcal{P}(\pi|s)].
+  $$
+
+  References:
+    - [RankDistil: Knowledge Distillation for Ranking, Reddi et al,
+      2021][reddi2021]
+
+  [reddi2021]: https://research.google/pubs/pub50695/
+  """
+
+  def __init__(self, name, sample_size, topk=None, temperature=1., ragged=False):
+    """Initializer.
+
+    Args:
+      name: A string used as the name for this loss.
+      sample_size: Number of permutations to sample from teacher scores.
+      topk: top-k entries over which order is matched. A penalty is applied over
+        non top-k items.
+      temperature: A float number to modify the logits as
+        `logits=logits/temperature`.
+      ragged: A boolean indicating whether the input tensors are ragged.
+ """ + super().__init__(name, None, temperature, ragged) + self._sample_size = sample_size + self._topk = topk + + def _compute_unreduced_loss_impl(self, labels, logits, mask=None): + """See `_RankingLoss`.""" + if mask is None: + mask = utils.is_label_valid(labels) + labels = tf.where(mask, labels, tf.zeros_like(labels)) + label_sum = tf.reduce_sum(input_tensor=labels, axis=1, keepdims=True) + nonzero_mask = tf.greater(tf.reshape(label_sum, [-1]), 0.0) + + teacher_scores = tf.where(mask, labels, tf.math.log(_EPSILON) * tf.ones_like(labels)) + + student_scores = tf.where(mask, logits, tf.math.log(_EPSILON) * tf.ones_like(logits)) + + # Sample teacher scores. + # [batch_size, list_size] -> [batch_size, sample_size, list_size]. + sampled_teacher_scores = tf.expand_dims(teacher_scores, 1) + sampled_teacher_scores = tf.repeat(sampled_teacher_scores, [self._sample_size], axis=1) + + batch_size, list_size = tf.unstack(tf.shape(input=labels)) + sampled_teacher_scores += _sample_gumbel([batch_size, self._sample_size, list_size], seed=37) + sampled_teacher_scores = tf.math.log(tf.nn.softmax(sampled_teacher_scores) + _EPSILON) + + # Expand student scores. + # [batch_size, list_size] -> [batch_size, sample_size, list_size]. + expanded_student_scores = tf.expand_dims(student_scores, 1) + expanded_student_scores = tf.repeat(expanded_student_scores, [self._sample_size], axis=1) + + # Sort teacher scores and student scores to obtain top-k student scores + # whose order is based on teacher scores. + sorted_student_scores = utils.sort_by_scores( + utils.reshape_first_ndims(sampled_teacher_scores, 2, [batch_size * self._sample_size]), + [utils.reshape_first_ndims(expanded_student_scores, 2, [batch_size * self._sample_size])], + shuffle_ties=True, + seed=37 + )[0] + sorted_student_scores = utils.reshape_first_ndims(sorted_student_scores, 1, [batch_size, self._sample_size]) + topk = self._topk or list_size + topk_student_scores = sorted_student_scores[:, :, :topk] + + # For \pi from teacher scores, compute top-k Plackett's probability as: + # \prod_{i=1}^k exp(s_{\pi(i)}) / \sum_{j=k}^N log(exp(s_{\pi(i)})). + + # Compute the denominator mask for \sum_{j=k}^N log(exp(s_{\pi(i)}). + # We apply logsumexp over valid entries in this mask. + # topk_pl_denominator_mask = batch x sample_size x valid_denom_entries, + # where valid_denom_entries = [[1 1 1 1 1 1] + # [0 1 1 1 1 1] + # [0 0 1 1 1 1]]. + # An alternative implementation would be to use `cumulative_logsumexp` with + # `reverse=True` to compute the denominator term. + ones = tf.ones((topk, list_size), dtype=tf.float32) + ones_upper = tf.linalg.band_part(ones, 0, -1) + topk_pl_denominator_mask = tf.tile(tf.expand_dims(ones_upper, axis=0), [batch_size * self._sample_size, 1, 1]) + # [batch_size * sample_size, topk, list_size] -> + # [batch_size, sample_size, topk, list_size]. + topk_pl_denominator_mask = tf.cast( + utils.reshape_first_ndims(topk_pl_denominator_mask, 1, [batch_size, self._sample_size]), dtype=tf.bool + ) + sorted_student_scores = tf.tile(tf.expand_dims(sorted_student_scores, 2), [1, 1, topk, 1]) + + sorted_student_scores_denom = tf.where( + topk_pl_denominator_mask, sorted_student_scores, + tf.math.log(_EPSILON) * tf.ones_like(sorted_student_scores) + ) + logprob = topk_student_scores - tf.math.reduce_logsumexp(sorted_student_scores_denom, axis=3) + # Compute log-likelihood over top-k Plackett-Luce scores. + # [batch_size, sample_size, topk] -> [batch_size, sample_size]. 
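+    # Summing the per-position log-probabilities gives the student's
+    # log-probability of each permutation sampled from the teacher scores.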
+ logprob = tf.reduce_sum(logprob, axis=2) + + # Compute RankDistil loss as a mean over samples. + # [batch_size, sample_size] -> [batch_size, 1]. + nll = tf.reduce_mean(-logprob, axis=1, keepdims=True) + + return nll, tf.reshape(tf.cast(nonzero_mask, dtype=tf.float32), [-1, 1]) diff --git a/deepray/losses/quantiles.py b/deepray/losses/quantiles.py index 4bbb8843..1a4f9803 100644 --- a/deepray/losses/quantiles.py +++ b/deepray/losses/quantiles.py @@ -15,8 +15,9 @@ """Implements quantiles losses.""" import tensorflow as tf +from tensorflow.python.keras import losses from typeguard import typechecked -from deepray.utils.keras_utils import LossFunctionWrapper + from deepray.utils.types import TensorLike, FloatTensorLike @@ -68,7 +69,7 @@ def pinball_loss(y_true: TensorLike, y_pred: TensorLike, tau: FloatTensorLike = @tf.keras.utils.register_keras_serializable(package="Deepray") -class PinballLoss(LossFunctionWrapper): +class PinballLoss(losses.LossFunctionWrapper): """Computes the pinball loss between `y_true` and `y_pred`. `loss = maximum(tau * (y_true - y_pred), (tau - 1) * (y_true - y_pred))` diff --git a/deepray/losses/softmax_loss.py b/deepray/losses/softmax_loss.py new file mode 100644 index 00000000..76970d30 --- /dev/null +++ b/deepray/losses/softmax_loss.py @@ -0,0 +1,167 @@ +from typing import Any, Dict, Optional + +import tensorflow as tf + +from deepray.losses import losses_impl +from deepray.losses import utils + +# The smallest probability that is used to derive smallest logit for invalid or +# padding entries. +_EPSILON = 1e-10 + + +class _RankingLoss(tf.keras.losses.Loss): + """Base class for all ranking losses. + + Please see tf.keras.losses.Loss for more information about such a class and + https://www.tensorflow.org/tutorials/distribute/custom_training on how to do + customized training. + """ + + def __init__( + self, + reduction: tf.losses.Reduction = tf.losses.Reduction.AUTO, + name: Optional[str] = None, + ragged: bool = False + ): + super().__init__(reduction, name) + # An instance of loss in `losses_impl`. Overwrite this in subclasses. + self._loss = None + self._ragged = ragged + + def __call__( + self, + y_true: utils.TensorLike, + y_pred: utils.TensorLike, + sample_weight: Optional[utils.TensorLike] = None + ) -> tf.Tensor: + """See tf.keras.losses.Loss.""" + if self._loss is None: + raise ValueError('self._loss is not defined. 
Please use a subclass.') + sample_weight = self._loss.normalize_weights(y_true, sample_weight) + return super().__call__(y_true, y_pred, sample_weight) + + def call(self, y_true: utils.TensorLike, y_pred: utils.TensorLike) -> tf.Tensor: + """See tf.keras.losses.Loss.""" + y_pred = self._loss.get_logits(y_pred) + losses, weights = self._loss.compute_unreduced_loss(labels=y_true, logits=y_pred) + return tf.multiply(losses, weights) + + def get_config(self) -> Dict[str, Any]: + config = super().get_config() + config.update({'ragged': self._ragged}) + return config + + +class _ListwiseLoss(_RankingLoss): + """Base class for listwise ranking losses.""" + + def __init__( + self, + reduction: tf.losses.Reduction = tf.losses.Reduction.AUTO, + name: Optional[str] = None, + lambda_weight: Optional[losses_impl._LambdaWeight] = None, + temperature: float = 1.0, + ragged: bool = False, + **kwargs + ): + super().__init__(reduction, name, ragged) + self._lambda_weight = lambda_weight + self._temperature = temperature + + def get_config(self) -> Dict[str, Any]: + config = super().get_config() + config.update( + { + 'lambda_weight': utils.serialize_keras_object(self._lambda_weight), + 'temperature': self._temperature, + } + ) + return config + + @classmethod + def from_config(cls, config, custom_objects=None): + config = config.copy() + config.update({ + 'lambda_weight': utils.deserialize_keras_object(config['lambda_weight']), + }) + return cls(**config) + + +class SoftmaxLoss(_ListwiseLoss): + r"""Computes Softmax cross-entropy loss between `y_true` and `y_pred`. + + For each list of scores `s` in `y_pred` and list of labels `y` in `y_true`: + + ``` + loss = - sum_i y_i * log(softmax(s_i)) + ``` + + Standalone usage: + + >>> y_true = [[1., 0.]] + >>> y_pred = [[0.6, 0.8]] + >>> loss = dp.losses.SoftmaxLoss() + >>> loss(y_true, y_pred).numpy() + 0.7981389 + + >>> # Using ragged tensors + >>> y_true = tf.ragged.constant([[1., 0.], [0., 1., 0.]]) + >>> y_pred = tf.ragged.constant([[0.6, 0.8], [0.5, 0.8, 0.4]]) + >>> loss = dp.losses.SoftmaxLoss(ragged=True) + >>> loss(y_true, y_pred).numpy() + 0.83911896 + + Usage with the `compile()` API: + + ```python + model.compile(optimizer='sgd', loss=tfr.keras.losses.SoftmaxLoss()) + ``` + + Definition: + + $$ + \mathcal{L}(\{y\}, \{s\}) = - \sum_i y_i + \log\left(\frac{\exp(s_i)}{\sum_j \exp(s_j)}\right) + $$ + """ + + def __init__( + self, + reduction: tf.losses.Reduction = tf.losses.Reduction.AUTO, + name: Optional[str] = None, + lambda_weight: Optional[losses_impl._LambdaWeight] = None, + temperature: float = 1.0, + ragged: bool = False + ): + """Softmax cross-entropy loss. + + Args: + reduction: (Optional) The `tf.keras.losses.Reduction` to use (see + `tf.keras.losses.Loss`). + name: (Optional) The name for the op. + lambda_weight: (Optional) A lambdaweight to apply to the loss. Can be one + of `tfr.keras.losses.DCGLambdaWeight`, + `tfr.keras.losses.NDCGLambdaWeight`, or, + `tfr.keras.losses.PrecisionLambdaWeight`. + temperature: (Optional) The temperature to use for scaling the logits. + ragged: (Optional) If True, this loss will accept ragged tensors. If + False, this loss will accept dense tensors. 
+ """ + super().__init__(reduction, name, lambda_weight, temperature, ragged) + self._loss = losses_impl.SoftmaxLoss( + name='{}_impl'.format(name) if name else None, + lambda_weight=lambda_weight, + temperature=temperature, + ragged=ragged + ) + + def __call__( + self, + y_true: utils.TensorLike, + y_pred: utils.TensorLike, + sample_weight: Optional[utils.TensorLike] = None + ) -> tf.Tensor: + """See _RankingLoss.""" + losses, sample_weight = self._loss.compute_per_list(y_true, y_pred, sample_weight) + return tf.keras.__internal__.losses.compute_weighted_loss(losses, sample_weight, reduction=self._get_reduction()) diff --git a/deepray/losses/tests/weighted_sparse_categorical_crossentropy_test.py b/deepray/losses/tests/weighted_sparse_categorical_crossentropy_test.py new file mode 100644 index 00000000..5f92306b --- /dev/null +++ b/deepray/losses/tests/weighted_sparse_categorical_crossentropy_test.py @@ -0,0 +1,377 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for masked LM loss.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +import tensorflow as tf + +from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import +from official.nlp.modeling import networks +from deepray.losses import weighted_sparse_categorical_crossentropy + + +@keras_parameterized.run_all_keras_modes +class ClassificationLossTest(keras_parameterized.TestCase): + + def create_lm_model(self, vocab_size, sequence_length, hidden_size, num_predictions, output="predictions"): + # First, create a transformer stack that we can use to get the LM's + # vocabulary weight. + xformer_stack = networks.TransformerEncoder( + vocab_size=vocab_size, + num_layers=1, + sequence_length=sequence_length, + hidden_size=hidden_size, + num_attention_heads=4, + ) + word_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) + mask = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) + type_ids = tf.keras.Input(shape=(sequence_length,), dtype=tf.int32) + lm_outputs, _ = xformer_stack([word_ids, mask, type_ids]) + + # Create a maskedLM from the transformer stack. + test_network = networks.MaskedLM( + num_predictions=num_predictions, input_width=lm_outputs.shape[-1], source_network=xformer_stack, output=output + ) + + # Create a model from the masked LM layer. 
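+    # The returned Keras model maps (lm_input_tensor, masked_lm_positions) to
+    # the masked-LM outputs that the tests below use as `predictions`.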
+ lm_input_tensor = tf.keras.Input(shape=(sequence_length, hidden_size)) + masked_lm_positions = tf.keras.Input(shape=(num_predictions,), dtype=tf.int32) + output = test_network([lm_input_tensor, masked_lm_positions]) + return tf.keras.Model([lm_input_tensor, masked_lm_positions], output) + + def create_classification_model(self, input_width, num_classes): + test_object = networks.Classification(input_width=input_width, num_classes=num_classes) + # Create a 2-dimensional input (the first dimension is implicit). + pooled_data = tf.keras.Input(shape=(input_width,), dtype=tf.float32) + output = test_object(pooled_data) + return tf.keras.Model(pooled_data, output) + + def test_per_example_loss_3d_input(self): + """Test per-example loss with a 3-dimensional input, from a masked LM.""" + vocab_size = 100 + sequence_length = 32 + hidden_size = 64 + num_predictions = 21 + model = self.create_lm_model( + vocab_size=vocab_size, + sequence_length=sequence_length, + hidden_size=hidden_size, + num_predictions=num_predictions + ) + + # Get the output of the masked LM. + batch_size = 3 + lm_input_data = 10 * np.random.random_sample((batch_size, sequence_length, hidden_size)) + masked_position_data = np.random.randint(2, size=(batch_size, num_predictions)) + output_data = model.predict([lm_input_data, masked_position_data]) + + # Calculate per-example loss. + labels = np.random.randint(vocab_size, size=(batch_size, num_predictions)) + per_example_loss_data = weighted_sparse_categorical_crossentropy.per_example_loss( + predictions=output_data, labels=labels + ) + + # Per-example loss data should have one value per prediction, and those + # values shouldn't be zero in this case (as we're using random data). + expected_shape = [batch_size, num_predictions] + self.assertEqual(expected_shape, per_example_loss_data.shape.as_list()) + self.assertNotAllClose(tf.zeros_like(per_example_loss_data), per_example_loss_data) + + def test_per_example_loss_2d_input(self): + """Test per-example loss with a 2-d input, from a classifier.""" + input_width = 512 + num_classes = 10 + model = self.create_classification_model(input_width, num_classes) + + # Invoke the network as part of a Model. + batch_size = 3 + input_data = 10 * np.random.random_sample((batch_size, input_width)) + output_data = model.predict(input_data) + + # Calculate per example loss. + labels = np.random.randint(num_classes, size=(batch_size)) + per_example_loss_data = weighted_sparse_categorical_crossentropy.per_example_loss( + predictions=output_data, labels=labels + ) + + # Per-example loss data should have one value per batch item, and those + # values shouldn't be zero in this case (as we're using random data). + self.assertEqual([batch_size], per_example_loss_data.shape.as_list()) + self.assertNotAllClose(tf.zeros_like(per_example_loss_data), per_example_loss_data) + + def test_per_example_loss_weights_3d_input(self): + """Test weighted per-example loss with a 3-d input, from a masked LM.""" + vocab_size = 100 + sequence_length = 32 + hidden_size = 64 + num_predictions = 21 + model = self.create_lm_model( + vocab_size=vocab_size, + sequence_length=sequence_length, + hidden_size=hidden_size, + num_predictions=num_predictions + ) + + # Get the output of the masked LM. 
+ batch_size = 3 + lm_input_data = 10 * np.random.random_sample((batch_size, sequence_length, hidden_size)) + masked_position_data = np.random.randint(2, size=(batch_size, num_predictions)) + output_data = model.predict([lm_input_data, masked_position_data]) + + # Calculate per-example loss with weights. + labels = np.random.randint(vocab_size, size=(batch_size, num_predictions)) + weights = np.random.randint(2, size=(batch_size, num_predictions)) + + per_example_loss_data = weighted_sparse_categorical_crossentropy.per_example_loss( + predictions=output_data, labels=labels, weights=weights + ) + + # Weighted per-example loss data should be equivalent to multiplying the + # loss tensor by the weights tensor. + expected_weighted_loss = per_example_loss_data * weights + self.assertAllClose(expected_weighted_loss, per_example_loss_data) + + def test_per_example_loss_weights_2d_input(self): + """Test weighted per-example loss with a 2-d input, from a classifier.""" + input_width = 512 + num_classes = 10 + model = self.create_classification_model(input_width, num_classes) + + # Invoke the network as part of a Model. + batch_size = 3 + input_data = 10 * np.random.random_sample((batch_size, input_width)) + output_data = model.predict(input_data) + + # Calculate per-example loss with weights. + labels = np.random.randint(num_classes, size=(batch_size)) + weights = np.random.randint(2, size=(batch_size)) + + per_example_loss_data = weighted_sparse_categorical_crossentropy.per_example_loss( + predictions=output_data, labels=labels, weights=weights + ) + + # Weighted per-example loss data should be equivalent to multiplying the + # loss tensor by the weights tensor. + expected_weighted_loss = per_example_loss_data * weights + self.assertAllClose(expected_weighted_loss, per_example_loss_data) + + def test_loss_3d_input(self): + """Test overall loss with a 3-dimensional input, from a masked LM.""" + vocab_size = 100 + sequence_length = 32 + hidden_size = 64 + num_predictions = 21 + model = self.create_lm_model( + vocab_size=vocab_size, + sequence_length=sequence_length, + hidden_size=hidden_size, + num_predictions=num_predictions + ) + + # Get the output of the masked LM. + batch_size = 3 + lm_input_data = 10 * np.random.random_sample((batch_size, sequence_length, hidden_size)) + masked_position_data = np.random.randint(2, size=(batch_size, num_predictions)) + output_data = model.predict([lm_input_data, masked_position_data]) + + # Calculate loss. + labels = np.random.randint(vocab_size, size=(batch_size, num_predictions)) + weights = np.random.randint(2, size=(batch_size, num_predictions)) + per_example_loss_data = weighted_sparse_categorical_crossentropy.loss( + predictions=output_data, labels=labels, weights=weights + ) + + # Total loss data should have one value, and that value shouldn't be zero + # in this case (as we're using random data). + expected_shape = [] # Scalar + self.assertEqual(expected_shape, per_example_loss_data.shape.as_list()) + self.assertNotAllClose(tf.zeros_like(per_example_loss_data), per_example_loss_data) + + def test_loss_2d_input(self): + """Test overall loss with a 2-d input, from a classifier.""" + input_width = 512 + num_classes = 10 + model = self.create_classification_model(input_width, num_classes) + + # Invoke the network as part of a Model. + batch_size = 3 + input_data = 10 * np.random.random_sample((batch_size, input_width)) + output_data = model.predict(input_data) + + # Calculate per example loss. 
+ labels = np.random.randint(num_classes, size=(batch_size)) + loss_data = weighted_sparse_categorical_crossentropy.loss(predictions=output_data, labels=labels) + + # Loss data should have one value only, and that value shouldn't be zero in + # this case (as we're using random data). + self.assertNotAllClose(0, loss_data) + + def test_loss_weights_3d_input(self): + """Test masked loss with a 3-dimensional input, from a masked LM.""" + vocab_size = 100 + sequence_length = 32 + hidden_size = 64 + num_predictions = 21 + model = self.create_lm_model( + vocab_size=vocab_size, + sequence_length=sequence_length, + hidden_size=hidden_size, + num_predictions=num_predictions + ) + + # Get the output of the masked LM. + batch_size = 3 + lm_input_data = 10 * np.random.random_sample((batch_size, sequence_length, hidden_size)) + masked_position_data = np.random.randint(2, size=(batch_size, num_predictions)) + output_data = model.predict([lm_input_data, masked_position_data]) + + # Calculate a fully masked weight tensor. This should give a loss of zero. + labels = np.random.randint(vocab_size, size=(batch_size, num_predictions)) + null_weights = np.zeros((batch_size, num_predictions)) + weighted_loss_data = weighted_sparse_categorical_crossentropy.loss( + predictions=output_data, labels=labels, weights=null_weights + ) + + # Because the tensor is fully masked, the loss should be 0. + self.assertAllClose(0, weighted_loss_data) + + def test_loss_weights_2d_input(self): + """Test masked loss with a 2-d input, from a classifier.""" + input_width = 512 + num_classes = 10 + model = self.create_classification_model(input_width, num_classes) + + # Invoke the network as part of a Model. + batch_size = 3 + input_data = 10 * np.random.random_sample((batch_size, input_width)) + output_data = model.predict(input_data) + + # Calculate a fully masked weight tensor. This should give a loss of zero. + labels = np.random.randint(num_classes, size=(batch_size)) + null_weights = np.zeros((batch_size)) + weighted_loss_data = weighted_sparse_categorical_crossentropy.loss( + predictions=output_data, labels=labels, weights=null_weights + ) + + # Because the tensor is fully masked, the loss should be 0. + self.assertAllClose(0, weighted_loss_data) + + def test_mismatched_predictions_and_labels_ranks_squeezes(self): + """Test that the loss asserts when rank(predictions)-1 != rank(labels).""" + batch_size = 3 + output_data = np.random.random_sample((batch_size, 10)) + labels = np.random.randint(10, size=(batch_size, 1)) + + # All that this test tests is that the squeeze is successful. 
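+    # `labels` carries a trailing dimension of size 1 that the loss
+    # implementation should squeeze away instead of raising an error.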
+ _ = weighted_sparse_categorical_crossentropy.per_example_loss(predictions=output_data, labels=labels) + + def test_mismatched_weights_and_labels_ranks_fail(self): + """Test that the loss asserts when rank(predictions) != rank(labels).""" + batch_size = 3 + output_data = np.random.random_sample((batch_size, 10, 15)) + labels = np.random.randint(10, size=(batch_size, 10)) + weights = np.random.randint(2, size=(batch_size)) + + with self.assertRaisesRegex(RuntimeError, ".*of the same rank.*"): + _ = weighted_sparse_categorical_crossentropy.per_example_loss( + predictions=output_data, labels=labels, weights=weights + ) + with self.assertRaisesRegex(RuntimeError, ".*of the same rank.*"): + _ = weighted_sparse_categorical_crossentropy.loss(predictions=output_data, labels=labels, weights=weights) + + def test_tf_tensor_inputs(self): + """Test that tf.Tensors can be used as inputs to the loss function.""" + batch_size = 3 + output_data = tf.convert_to_tensor(np.random.random_sample((batch_size, 10, 15))) + labels = tf.convert_to_tensor(np.random.randint(10, size=(batch_size, 10))) + weights = tf.convert_to_tensor(np.random.randint(2, size=(batch_size, 10))) + + # We're not trying to validate numerical correctness, just ensure that + # we can in fact pass tensors to these functions without causing runtime + # errors from the shape checking code. + _ = weighted_sparse_categorical_crossentropy.per_example_loss( + predictions=output_data, labels=labels, weights=weights + ) + _ = weighted_sparse_categorical_crossentropy.loss(predictions=output_data, labels=labels, weights=weights) + + def test_legacy_lm_loss_compatibility(self): + """Test to validate computational correctness during refactors.""" + # This is the empirical output of a masked LM with the following parameters: + # batch_size = 3 + # vocab_size = 5 + # sequence_length = 4 + # num_predictions = 2 + output_data = np.array( + [ + [ + [-2.5286622, -1.0963473, -1.4925185, -2.4451098, -1.2923571], + [-2.7117882, -1.1205841, -4.02187, -0.9966936, -1.5119683] + ], + [ + [-2.5379114, -0.82479054, -2.287932, -1.3747153, -2.053741], + [-2.5379114, -0.82479054, -2.287932, -1.3747153, -2.053741] + ], + [ + [-2.7760355, -1.8219438, -3.0924666, -1.0779881, -0.9407509], + [-2.7760355, -1.8219438, -3.0924666, -1.0779881, -0.9407509] + ] + ] + ) + labels = np.array([[4, 0], [2, 2], [2, 1]]) + + # Validate that per_example loss calculations are the same. + per_example_loss_data = weighted_sparse_categorical_crossentropy.per_example_loss( + predictions=output_data, labels=labels + ) + expected_per_example_loss_data = [[1.2923571, 2.7117882], [2.287932, 2.287932], [3.0924666, 1.8219438]] + self.assertAllClose(expected_per_example_loss_data, per_example_loss_data) + + # Validate that overall loss calculations are the same. + weights = np.array([[1, 0], [0, 0], [0, 0]]) + loss_data = weighted_sparse_categorical_crossentropy.loss(predictions=output_data, labels=labels, weights=weights) + expected_loss_data = 1.2923441 + self.assertAllClose(expected_loss_data, loss_data) + + def test_legacy_classification_loss_compatibility(self): + """Test to validate computational correctness during refactors.""" + # This is the empirical output of a classifier with the following params: + # batch_size = 2 + # num_classes = 3 + output_data = np.array( + [[-1.6094601e-03, -1.0966038e+01, -6.4434357e+00], [-1.6975292e-03, -6.4009643e+00, -1.0226612e+01]] + ) + labels = np.array([2, 1]) + + # Validate that per_example loss calculations are the same. 
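+    # `output_data` holds log-probabilities, so each expected per-example loss
+    # is the negated entry of `output_data` at the corresponding label index.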
+ per_example_loss_data = weighted_sparse_categorical_crossentropy.per_example_loss( + predictions=output_data, labels=labels + ) + expected_per_example_loss_data = [6.4434357, 6.4009643] + self.assertAllClose(expected_per_example_loss_data, per_example_loss_data) + + # Validate that overall loss calculations are the same. + weights = None + loss_data = weighted_sparse_categorical_crossentropy.loss(predictions=output_data, labels=labels, weights=weights) + expected_loss_data = 6.4222 + self.assertAllClose(expected_loss_data, loss_data) + + +if __name__ == "__main__": + tf.test.main() diff --git a/deepray/losses/triplet.py b/deepray/losses/triplet.py index 0dea42a7..df566b28 100644 --- a/deepray/losses/triplet.py +++ b/deepray/losses/triplet.py @@ -14,12 +14,14 @@ # ============================================================================== """Implements triplet loss.""" +from typing import Optional, Union, Callable + import tensorflow as tf +from tensorflow.python.keras import losses +from typeguard import typechecked + from deepray.losses import metric_learning -from deepray.utils.keras_utils import LossFunctionWrapper from deepray.utils.types import FloatTensorLike, TensorLike -from typeguard import typechecked -from typing import Optional, Union, Callable def _masked_maximum(data, mask, dim=1): @@ -272,7 +274,7 @@ def triplet_hard_loss( @tf.keras.utils.register_keras_serializable(package="Deepray") -class TripletSemiHardLoss(LossFunctionWrapper): +class TripletSemiHardLoss(losses.LossFunctionWrapper): """Computes the triplet loss with semi-hard negative mining. The loss encourages the positive distances (between a pair of embeddings @@ -309,7 +311,7 @@ def __init__( @tf.keras.utils.register_keras_serializable(package="Deepray") -class TripletHardLoss(LossFunctionWrapper): +class TripletHardLoss(losses.LossFunctionWrapper): """Computes the triplet loss with hard negative and hard positive mining. The loss encourages the maximum positive distance (between a pair of embeddings diff --git a/deepray/losses/utils.py b/deepray/losses/utils.py new file mode 100644 index 00000000..ee3d2f4d --- /dev/null +++ b/deepray/losses/utils.py @@ -0,0 +1,563 @@ +# Copyright 2024 The TensorFlow Ranking Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utility functions for ranking library.""" + +from typing import Callable, Dict, Tuple + +import tensorflow as tf + +_PADDING_LABEL = -1. +_PADDING_PREDICTION = -1e6 +_PADDING_WEIGHT = 0. 
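+# The padding values above are used by `ragged_to_dense` below when converting
+# ragged inputs to dense tensors: padded labels become -1 (invalid), padded
+# predictions get a very low score, and padded weights become 0.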
+ +TensorLike = tf.types.experimental.TensorLike +TransformationFunction = Callable[[TensorLike], tf.Tensor] +LossFunction = Callable[[TensorLike, TensorLike, Dict[str, TensorLike]], tf.Tensor] +MetricFunction = Callable[[TensorLike, TensorLike, Dict[str, TensorLike]], tf.Tensor] +TensorLike = tf.types.experimental.TensorLike +GainFunction = Callable[[TensorLike], tf.Tensor] +RankDiscountFunction = Callable[[TensorLike], tf.Tensor] +PositiveFunction = Callable[[TensorLike], tf.Tensor] + + +def _to_nd_indices(indices): + """Returns indices used for tf.gather_nd or tf.scatter_nd. + + Args: + indices: A `Tensor` of shape [batch_size, size] with integer values. The + values are the indices of another `Tensor`. For example, `indices` is the + output of tf.argsort or tf.math.top_k. + + Returns: + A `Tensor` with shape [batch_size, size, 2] that can be used by tf.gather_nd + or tf.scatter_nd. + + """ + indices.get_shape().assert_has_rank(2) + batch_ids = tf.ones_like(indices) * tf.expand_dims(tf.range(tf.shape(input=indices)[0]), 1) + return tf.stack([batch_ids, indices], axis=-1) + + +def gather_per_row(inputs, indices): + """Gathers the values from input tensor based on per-row indices. + + Example Usage: + ```python + scores = [[1., 3., 2.], [1., 2., 3.]] + indices = [[1, 2], [2, 1]] + tfr.utils.gather_per_row(scores, indices) + ``` + Returns [[3., 2.], [3., 2.]] + + Args: + inputs: (tf.Tensor) A tensor of shape [batch_size, list_size] or + [batch_size, list_size, feature_dims]. + indices: (tf.Tensor) A tensor of shape [batch_size, size] of positions to + gather inputs from. Each index corresponds to a row entry in input_tensor. + + Returns: + A tensor of values gathered from inputs, of shape [batch_size, size] or + [batch_size, size, feature_dims], depending on whether the input was 2D or + 3D. + """ + indices = tf.cast(indices, dtype=tf.int32) + return tf.gather(inputs, indices, batch_dims=1, axis=1) + + +def is_label_valid(labels): + """Returns a boolean `Tensor` for label validity.""" + labels = tf.convert_to_tensor(value=labels) + return tf.greater_equal(labels, 0.) + + +def _get_shuffle_indices(shape, mask=None, shuffle_ties=True, seed=None): + """Gets indices which would shuffle a tensor. + + Args: + shape: The shape of the indices to generate. + mask: An optional mask that indicates which entries to place first. Its + shape should be equal to given shape. + shuffle_ties: Whether to randomly shuffle ties. + seed: The ops-level random seed. + + Returns: + An int32 `Tensor` with given `shape`. Its entries are indices that would + (randomly) shuffle the values of a `Tensor` of given `shape` along the last + axis while placing masked items first. + """ + # Generate random values when shuffling ties or all zeros when not. + if shuffle_ties: + shuffle_values = tf.random.uniform(shape, seed=seed) + else: + shuffle_values = tf.zeros(shape, dtype=tf.float32) + + # Since shuffle_values is always in [0, 1), we can safely increase entries + # where mask=False with 2.0 to make sure those are placed last during the + # argsort op. + if mask is not None: + shuffle_values = tf.where(mask, shuffle_values, shuffle_values + 2.0) + + # Generate indices by sorting the shuffle values. + return tf.argsort(shuffle_values, stable=True) + + +def sort_by_scores(scores, features_list, topn=None, shuffle_ties=True, seed=None, mask=None): + """Sorts list of features according to per-example scores. + + Args: + scores: A `Tensor` of shape [batch_size, list_size] representing the + per-example scores. 
+ features_list: A list of `Tensor`s to be sorted. The shape of the `Tensor` + can be [batch_size, list_size] or [batch_size, list_size, feature_dims]. + The latter is applicable for example features. + topn: An integer as the cutoff of examples in the sorted list. + shuffle_ties: A boolean. If True, randomly shuffle before the sorting. + seed: The ops-level random seed used when `shuffle_ties` is True. + mask: An optional `Tensor` of shape [batch_size, list_size] representing + which entries are valid for sorting. Invalid entries will be pushed to the + end. + + Returns: + A list of `Tensor`s as the list of sorted features by `scores`. + """ + with tf.compat.v1.name_scope(name='sort_by_scores'): + scores = tf.cast(scores, tf.float32) + scores.get_shape().assert_has_rank(2) + list_size = tf.shape(input=scores)[1] + if topn is None: + topn = list_size + topn = tf.minimum(topn, list_size) + + # Set invalid entries (those whose mask value is False) to the minimal value + # of scores so they will be placed last during sort ops. + if mask is not None: + scores = tf.where(mask, scores, tf.reduce_min(scores)) + + # Shuffle scores to break ties and/or push invalid entries (according to + # mask) to the end. + shuffle_ind = None + if shuffle_ties or mask is not None: + shuffle_ind = _get_shuffle_indices(tf.shape(input=scores), mask, shuffle_ties=shuffle_ties, seed=seed) + scores = tf.gather(scores, shuffle_ind, batch_dims=1, axis=1) + + # Perform sort and return sorted feature_list entries. + _, indices = tf.math.top_k(scores, topn, sorted=True) + if shuffle_ind is not None: + indices = tf.gather(shuffle_ind, indices, batch_dims=1, axis=1) + return [tf.gather(f, indices, batch_dims=1, axis=1) for f in features_list] + + +def sorted_ranks(scores, shuffle_ties=True, seed=None): + """Returns an int `Tensor` as the ranks (1-based) after sorting scores. + + Example: Given scores = [[1.0, 3.5, 2.1]], the returned ranks will be [[3, 1, + 2]]. It means that scores 1.0 will be ranked at position 3, 3.5 will be ranked + at position 1, and 2.1 will be ranked at position 2. + + Args: + scores: A `Tensor` of shape [batch_size, list_size] representing the + per-example scores. + shuffle_ties: See `sort_by_scores`. + seed: See `sort_by_scores`. + + Returns: + A 1-based int `Tensor`s as the ranks. + """ + with tf.compat.v1.name_scope(name='sorted_ranks'): + batch_size, list_size = tf.unstack(tf.shape(input=scores)) + # The current position in the list for each score. + positions = tf.tile(tf.expand_dims(tf.range(list_size), 0), [batch_size, 1]) + # For score [[1.0, 3.5, 2.1]], sorted_positions are [[1, 2, 0]], meaning the + # largest score is at position 1, the 2nd is at position 2 and 3rd is at + # position 0. + sorted_positions = sort_by_scores(scores, [positions], shuffle_ties=shuffle_ties, seed=seed)[0] + # The indices of sorting sorted_positions will be [[2, 0, 1]] and ranks are + # 1-based and thus are [[3, 1, 2]]. + ranks = tf.argsort(sorted_positions) + 1 + return ranks + + +def shuffle_valid_indices(is_valid, seed=None): + """Returns a shuffle of indices with valid ones on top.""" + return organize_valid_indices(is_valid, shuffle=True, seed=seed) + + +def organize_valid_indices(is_valid, shuffle=True, seed=None): + """Organizes indices in such a way that valid items appear first. + + Args: + is_valid: A boolean `Tensor` for entry validity with shape [batch_size, + list_size]. + shuffle: A boolean indicating whether valid items should be shuffled. + seed: An int for random seed at the op level. 
It works together with the + seed at global graph level together to determine the random number + generation. See `tf.set_random_seed`. + + Returns: + A tensor of indices with shape [batch_size, list_size, 2]. The returned + tensor can be used with `tf.gather_nd` and `tf.scatter_nd` to compose a new + [batch_size, list_size] tensor. The values in the last dimension are the + indices for an element in the input tensor. + """ + with tf.compat.v1.name_scope(name='organize_valid_indices'): + is_valid = tf.convert_to_tensor(value=is_valid) + is_valid.get_shape().assert_has_rank(2) + output_shape = tf.shape(input=is_valid) + + if shuffle: + values = tf.random.uniform(output_shape, seed=seed) + else: + values = ( + tf.ones_like(is_valid, tf.float32) * tf.reverse(tf.cast(tf.range(output_shape[1]), dtype=tf.float32), [-1]) + ) + + rand = tf.where(is_valid, values, tf.ones(output_shape) * -1e-6) + # shape(indices) = [batch_size, list_size] + indices = tf.argsort(rand, direction='DESCENDING', stable=True) + return _to_nd_indices(indices) + + +def reshape_first_ndims(tensor, first_ndims, new_shape): + """Reshapes the first n dims of the input `tensor` to `new shape`. + + Args: + tensor: The input `Tensor`. + first_ndims: A int denoting the first n dims. + new_shape: A list of int representing the new shape. + + Returns: + A reshaped `Tensor`. + """ + assert tensor.get_shape().ndims is None or tensor.get_shape().ndims >= first_ndims, ( + 'Tensor shape is less than {} dims.'.format(first_ndims) + ) + new_shape = tf.concat([new_shape, tf.shape(input=tensor)[first_ndims:]], 0) + if isinstance(tensor, tf.SparseTensor): + return tf.sparse.reshape(tensor, new_shape) + + return tf.reshape(tensor, new_shape) + + +def reshape_to_2d(tensor): + """Converts the given `tensor` to a 2-D `Tensor`.""" + with tf.compat.v1.name_scope(name='reshape_to_2d'): + rank = tensor.shape.rank if tensor.shape is not None else None + if rank is not None and rank != 2: + if rank >= 3: + tensor = tf.reshape(tensor, tf.shape(input=tensor)[0:2]) + else: + while tensor.shape.rank < 2: + tensor = tf.expand_dims(tensor, -1) + return tensor + + +def _circular_indices(size, num_valid_entries): + """Creates circular indices with padding and mask for non-padded ones. + + This returns a indices and a mask Tensor, where the mask is True for valid + entries and False for padded entries. + + The returned indices have the shape of [batch_size, size], where the + batch_size is obtained from the 1st dim of `num_valid_entries`. For a + batch_size = 1, when size = 3, returns [[0, 1, 2]], when num_valid_entries = + 2, returns [[0, 1, 0]]. The first 2 are valid and the returned mask is [True, + True, False]. + + Args: + size: A scalar int `Tensor` for the size. + num_valid_entries: A 1-D `Tensor` with shape [batch_size] representing the + number of valid entries for each instance in a batch. + + Returns: + A tuple of Tensors (batch_indices, batch_indices_mask). The first has + shape [batch_size, size] and the second has shape [batch_size, size]. + """ + with tf.compat.v1.name_scope(name='circular_indices'): + # shape = [batch_size, size] with value [[0, 1, ...], [0, 1, ...], ...]. + batch_indices = tf.tile(tf.expand_dims(tf.range(size), 0), [tf.shape(input=num_valid_entries)[0], 1]) + num_valid_entries = tf.reshape(num_valid_entries, [-1, 1]) + batch_indices_mask = tf.less(batch_indices, num_valid_entries) + # Use mod to make the indices to the ranges of valid entries. 
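+    # Lists with zero valid entries are clamped to 1 so the modulo below never
+    # divides by zero; their `batch_indices_mask` is already all False.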
+ num_valid_entries = tf.where(tf.less(num_valid_entries, 1), tf.ones_like(num_valid_entries), num_valid_entries) + batch_indices = tf.math.mod(batch_indices, num_valid_entries) + return batch_indices, batch_indices_mask + + +def padded_nd_indices(is_valid, shuffle=False, seed=None): + """Pads the invalid entries by valid ones and returns the nd_indices. + + For example, when we have a batch_size = 1 and list_size = 3. Only the first 2 + entries are valid. We have: + ``` + is_valid = [[True, True, False]] + nd_indices, mask = padded_nd_indices(is_valid) + ``` + nd_indices has a shape [1, 3, 2] and mask has a shape [1, 3]. + + ``` + nd_indices = [[[0, 0], [0, 1], [0, 0]]] + mask = [[True, True, False]] + ``` + nd_indices can be used by gather_nd on a Tensor t + ``` + padded_t = tf.gather_nd(t, nd_indices) + ``` + and get the following Tensor with first 2 dims are [1, 3]: + ``` + padded_t = [[t(0, 0), t(0, 1), t(0, 0)]] + ``` + + Args: + is_valid: A boolean `Tensor` for entry validity with shape [batch_size, + list_size]. + shuffle: A boolean that indicates whether valid indices should be shuffled. + seed: Random seed for shuffle. + + Returns: + A tuple of Tensors (nd_indices, mask). The first has shape [batch_size, + list_size, 2] and it can be used in gather_nd or scatter_nd. The second has + the shape of [batch_size, list_size] with value True for valid indices. + """ + with tf.compat.v1.name_scope(name='nd_indices_with_padding'): + is_valid = tf.convert_to_tensor(value=is_valid) + list_size = tf.shape(input=is_valid)[1] + num_valid_entries = tf.reduce_sum(input_tensor=tf.cast(is_valid, dtype=tf.int32), axis=1) + indices, mask = _circular_indices(list_size, num_valid_entries) + # Valid indices of the tensor are shuffled and put on the top. + # [batch_size, list_size, 2]. + shuffled_indices = organize_valid_indices(is_valid, shuffle=shuffle, seed=seed) + # Construct indices for gather_nd [batch_size, list_size, 2]. + nd_indices = _to_nd_indices(indices) + nd_indices = tf.gather_nd(shuffled_indices, nd_indices) + return nd_indices, mask + + +def de_noise(counts, noise, ratio=0.9): + """Returns a float `Tensor` as the de-noised `counts`. + + The implementation is based on the the paper by Zhang and Xu: "Fast Exact + Maximum Likelihood Estimation for Mixture of Language Models." It assumes that + the observed `counts` are generated from a mixture of `noise` and the true + distribution: `ratio * noise_distribution + (1 - ratio) * true_distribution`, + where the contribution of `noise` is controlled by `ratio`. This method + returns the true distribution. + + Args: + counts: A 2-D `Tensor` representing the observations. All values should be + nonnegative. + noise: A 2-D `Tensor` representing the noise distribution. This should be + the same shape as `counts`. All values should be positive and are + normalized to a simplex per row. + ratio: A float in (0, 1) representing the contribution from noise. + + Returns: + A 2-D float `Tensor` and each row is a simplex. + Raises: + ValueError: if `ratio` is not in (0,1). + InvalidArgumentError: if any of `counts` is negative or any of `noise` is + not positive. 
+ """ + if not 0 < ratio < 1: + raise ValueError('ratio should be in (0, 1), but get {}'.format(ratio)) + odds = (1 - ratio) / ratio + + counts = tf.cast(counts, dtype=tf.float32) + noise = tf.cast(noise, dtype=tf.float32) + + counts.get_shape().assert_has_rank(2) + noise.get_shape().assert_has_rank(2) + noise.get_shape().assert_is_compatible_with(counts.get_shape()) + + with tf.compat.v1.name_scope(name='de_noise'): + counts_nonneg = tf.debugging.assert_greater_equal(counts, 0.) + noise_pos = tf.debugging.assert_greater(noise, 0.) + with tf.control_dependencies([counts_nonneg, noise_pos]): + # Normalize noise to be a simplex per row. + noise = noise / tf.reduce_sum(noise, axis=1, keepdims=True) + sorted_idx = tf.argsort(counts / noise, direction='DESCENDING', stable=True) + nd_indices = _to_nd_indices(sorted_idx) + sorted_counts = tf.gather_nd(counts, nd_indices) + sorted_noise = tf.gather_nd(noise, nd_indices) + # Decide whether an entry will have a positive value or 0. + is_pos = tf.cast( + (odds + tf.cumsum(sorted_noise, axis=1)) / tf.cumsum(sorted_counts, axis=1) > sorted_noise / sorted_counts, + tf.float32 + ) + # The lambda in the paper above, which is the lagrangian multiplier for + # the simplex constraint on the variables. + lagrangian_multiplier = tf.reduce_sum(sorted_counts * is_pos, axis=1, keepdims=True + ) / (1 + tf.reduce_sum(sorted_noise * is_pos, axis=1, keepdims=True) / odds) + res = (sorted_counts / lagrangian_multiplier - sorted_noise / odds) * is_pos + return tf.scatter_nd(nd_indices, res, shape=tf.shape(counts)) + + +def ragged_to_dense(labels, predictions, weights): + """Converts given inputs from ragged tensors to dense tensors. + + Args: + labels: A `tf.RaggedTensor` of the same shape as `predictions` representing + relevance. + predictions: A `tf.RaggedTensor` with shape [batch_size, (list_size)]. Each + value is the ranking score of the corresponding example. + weights: An optional `tf.RaggedTensor` of the same shape of predictions or a + `tf.Tensor` of shape [batch_size, 1]. The former case is per-example and + the latter case is per-list. + + Returns: + A tuple (labels, predictions, weights, mask) of dense `tf.Tensor`s. + """ + # TODO: Add checks to validate (ragged) shapes of input tensors. + mask = tf.cast(tf.ones_like(labels).to_tensor(0.), dtype=tf.bool) + labels = labels.to_tensor(_PADDING_LABEL) + if predictions is not None: + predictions = predictions.to_tensor(_PADDING_PREDICTION) + if isinstance(weights, tf.RaggedTensor): + weights = weights.to_tensor(_PADDING_WEIGHT) + return labels, predictions, weights, mask + + +def parse_keys_and_weights(key: str) -> Dict[str, float]: + """Parses the encoded key to keys and weights. + + This parse function will remove all spaces. Different keys are split by "," + and then weight associated with key is split by ":". + + Args: + key: A string represents a key, or a string of multiple keys, split by ",", + and weighted by the weights split by ":". For example, key = + 'softmax_loss:0.9,sigmoid_cross_entropy_loss:0.1'. + + Returns: + A dict from keys to weights. + """ + + def _parse(key_with_weight: str) -> Tuple[str, float]: + if ':' in key_with_weight: + pair = key_with_weight.split(':') + else: + pair = [key_with_weight, 1.0] + + return pair[0], float(pair[1]) + + # Remove spaces. 
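+ # e.g. 'softmax_loss: 0.9, sigmoid_cross_entropy_loss: 0.1' becomes
+ # 'softmax_loss:0.9,sigmoid_cross_entropy_loss:0.1' before splitting.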
+ key = key.replace(' ', '')
+ # Single objective or multiple objectives with weights:
+ keys_to_weights = dict(_parse(loss_key_with_weight) for loss_key_with_weight in key.split(','))
+
+ return keys_to_weights
+
+
+def serialize_keras_object(obj):
+ if hasattr(tf.keras.utils, "legacy"):
+ return tf.keras.utils.legacy.serialize_keras_object(obj)
+ else:
+ return tf.keras.utils.serialize_keras_object(obj)
+
+
+def deserialize_keras_object(config, module_objects=None, custom_objects=None, printable_module_name=None):
+ if hasattr(tf.keras.utils, "legacy"):
+ return tf.keras.utils.legacy.deserialize_keras_object(config, custom_objects, module_objects, printable_module_name)
+ else:
+ return tf.keras.utils.deserialize_keras_object(config, custom_objects, module_objects, printable_module_name)
+
+
+# The following functions are used to transform labels and ranks for losses and
+# metrics computation. User customized functions can be defined similarly by
+# following the same annotations.
+@tf.keras.utils.register_keras_serializable(package="deepray.losses")
+def identity(label: TensorLike) -> tf.Tensor:
+ """Identity function that returns the input label.
+
+ Args:
+ label: A `Tensor` or anything that can be converted to a tensor using
+ `tf.convert_to_tensor`.
+
+ Returns:
+ The input label.
+ """
+ return label
+
+
+@tf.keras.utils.register_keras_serializable(package="deepray.losses")
+def inverse(rank: TensorLike) -> tf.Tensor:
+ """Computes the inverse of input rank.
+
+ Args:
+ rank: A `Tensor` or anything that can be converted to a tensor using
+ `tf.convert_to_tensor`.
+
+ Returns:
+ A `Tensor` that has each input element transformed as `x` to `1/x`.
+ """
+ return tf.math.divide_no_nan(1., rank)
+
+
+@tf.keras.utils.register_keras_serializable(package="deepray.losses")
+def pow_minus_1(label: TensorLike) -> tf.Tensor:
+ """Computes `2**x - 1` element-wise for each label.
+
+ Can be used to define `gain_fn` for `tfr.keras.metrics.NDCGMetric`.
+
+ Args:
+ label: A `Tensor` or anything that can be converted to a tensor using
+ `tf.convert_to_tensor`.
+
+ Returns:
+ A `Tensor` that has each input element transformed as `x` to `2**x - 1`.
+ """
+ return tf.math.pow(2., label) - 1.
+
+
+@tf.keras.utils.register_keras_serializable(package="deepray.losses")
+def log2_inverse(rank: TensorLike) -> tf.Tensor:
+ """Computes `1./log2(1+x)` element-wise for each rank.
+
+ Can be used to define `rank_discount_fn` for `tfr.keras.metrics.NDCGMetric`.
+
+ Args:
+ rank: A `Tensor` or anything that can be converted to a tensor using
+ `tf.convert_to_tensor`.
+
+ Returns:
+ A `Tensor` that has each input element transformed as `x` to `1./log2(1+x)`.
+ """
+ return tf.math.divide_no_nan(tf.math.log(2.), tf.math.log1p(rank))
+
+
+@tf.keras.utils.register_keras_serializable(package="deepray.losses")
+def is_greater_equal_1(label: TensorLike) -> tf.Tensor:
+ """Computes whether label is greater than or equal to 1.
+
+ Args:
+ label: A `Tensor` or anything that can be converted to a tensor using
+ `tf.convert_to_tensor`.
+
+ Returns:
+ A `Tensor` that has each input element transformed as `x` to `I(x >= 1)`.
+ """
+ return tf.greater_equal(label, 1.0)
+
+
+@tf.keras.utils.register_keras_serializable(package="deepray.losses")
+def symmetric_log1p(t: TensorLike) -> tf.Tensor:
+ """Computes `sign(x) * log(1 + |x|)` element-wise.
+
+ Args:
+ t: A `Tensor` or anything that can be converted to a tensor using
+ `tf.convert_to_tensor`.
+
+ Returns:
+ A `Tensor` that has each input element transformed as `x` to
+ `sign(x) * log(1 + |x|)`.
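+
+ For example, this maps `-3.` to roughly `-1.386` and `4.` to roughly
+ `1.609`.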
+ """
+ return tf.math.log1p(t * tf.sign(t)) * tf.sign(t)
diff --git a/deepray/losses/weighted_sparse_categorical_crossentropy.py b/deepray/losses/weighted_sparse_categorical_crossentropy.py
new file mode 100644
index 00000000..e21a86f9
--- /dev/null
+++ b/deepray/losses/weighted_sparse_categorical_crossentropy.py
@@ -0,0 +1,108 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Sparse categorical cross-entropy losses."""
+
+from __future__ import absolute_import
+from __future__ import division
+# from __future__ import google_type_annotations
+from __future__ import print_function
+
+import tensorflow as tf
+
+
+def _adjust_labels(labels, predictions):
+ """Adjust the 'labels' tensor by squeezing it if needed."""
+ labels = tf.cast(labels, tf.int32)
+ if len(predictions.shape) == len(labels.shape):
+ labels = tf.squeeze(labels, [-1])
+ return labels, predictions
+
+
+def _validate_rank(labels, predictions, weights):
+ if weights is not None and len(weights.shape) != len(labels.shape):
+ raise RuntimeError(
+ ("Weight and label tensors were not of the same rank. weights.shape "
+ "was %s, and labels.shape was %s.") % (weights.shape, labels.shape)
+ )
+ if (len(predictions.shape) - 1) != len(labels.shape):
+ raise RuntimeError(
+ (
+ "Weighted sparse categorical crossentropy expects `labels` to have a "
+ "rank of one less than `predictions`. labels.shape was %s, and "
+ "predictions.shape was %s."
+ ) % (labels.shape, predictions.shape)
+ )
+
+
+def per_example_loss(labels, predictions, weights=None):
+ """Calculate a per-example sparse categorical crossentropy loss.
+
+ This loss function assumes that the predictions are post-softmax.
+ Args:
+ labels: The labels to evaluate against. Should be a set of integer indices
+ ranging from 0 to (vocab_size-1).
+ predictions: The network predictions. Should have softmax already applied.
+ weights: An optional weight array of the same shape as the 'labels' array.
+ If None, all examples will be used.
+
+ Returns:
+ A tensor of shape predictions.shape[:-1] containing the per-example
+ loss.
+ """
+ # When using these functions with the Keras core API, we will need to squeeze
+ # the labels tensor - Keras adds a spurious inner dimension.
+ labels, predictions = _adjust_labels(labels, predictions)
+ _validate_rank(labels, predictions, weights)
+
+ labels_one_hot = tf.keras.backend.one_hot(labels, predictions.shape[-1])
+ labels_one_hot = tf.keras.backend.cast(labels_one_hot, predictions.dtype)
+ per_example_loss_data = -tf.keras.backend.sum(predictions * labels_one_hot, axis=[-1])
+ if weights is not None:
+ weights = tf.keras.backend.cast(weights, per_example_loss_data.dtype)
+ per_example_loss_data = weights * per_example_loss_data
+ return per_example_loss_data
+
+
+def loss(labels, predictions, weights=None):
+ """Calculate a per-batch sparse categorical crossentropy loss.
+ + This loss function assumes that the predictions are post-softmax. + Args: + labels: The labels to evaluate against. Should be a set of integer indices + ranging from 0 to (vocab_size-1). + predictions: The network predictions. Should have softmax already applied. + weights: An optional weight array of the same shape as the 'labels' array. + If None, all examples will be used. + + Returns: + A loss scalar. + + Raises: + RuntimeError if the passed tensors do not have the same rank. + """ + # When using these functions with the Keras core API, we will need to squeeze + # the labels tensor - Keras adds a spurious inner dimension. + labels, predictions = _adjust_labels(labels, predictions) + _validate_rank(labels, predictions, weights) + + per_example_loss_data = per_example_loss(labels, predictions, weights) + + if weights is None: + return tf.keras.backend.mean(per_example_loss_data) + else: + numerator = tf.keras.backend.sum(per_example_loss_data) + weights = tf.keras.backend.cast(weights, predictions.dtype) + denominator = tf.keras.backend.sum(weights) + 1e-5 + return numerator / denominator diff --git a/deepray/metrics/__init__.py b/deepray/metrics/__init__.py index f628fa35..8f3c2214 100755 --- a/deepray/metrics/__init__.py +++ b/deepray/metrics/__init__.py @@ -22,13 +22,6 @@ hamming_loss_fn, ) from deepray.metrics.utils import MeanMetricWrapper -from deepray.metrics.matthews_correlation_coefficient import ( - MatthewsCorrelationCoefficient, -) -from deepray.metrics.multilabel_confusion_matrix import ( - MultiLabelConfusionMatrix, -) -from deepray.metrics.r_square import RSquare from deepray.metrics.geometric_mean import GeometricMean from deepray.metrics.harmonic_mean import HarmonicMean from deepray.metrics.streaming_correlations import ( @@ -37,3 +30,5 @@ PearsonsCorrelation, SpearmansRank, ) +from deepray.metrics.ndcg import NDCGMetric +from deepray.metrics.mrr import MRRMetric \ No newline at end of file diff --git a/deepray/metrics/_ranking.py b/deepray/metrics/_ranking.py new file mode 100644 index 00000000..c85a9d6c --- /dev/null +++ b/deepray/metrics/_ranking.py @@ -0,0 +1,165 @@ +# Copyright 2024 The TensorFlow Ranking Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tensorflow as tf +from typing import Callable + +TensorLike = tf.types.experimental.TensorLike +GainFunction = Callable[[TensorLike], tf.Tensor] +RankDiscountFunction = Callable[[TensorLike], tf.Tensor] +PositiveFunction = Callable[[TensorLike], tf.Tensor] + + +class _RankingMetric(tf.keras.metrics.Mean): + """Implements base ranking metric class. + + Please see tf.keras.metrics.Mean for more information about such a class and + https://www.tensorflow.org/tutorials/distribute/custom_training on how to do + customized training. + """ + + def __init__(self, name=None, dtype=None, ragged=False, **kwargs): + super(_RankingMetric, self).__init__(name=name, dtype=dtype, **kwargs) + # An instance of `metrics_impl._RankingMetric`. + # Overwrite this in subclasses. 
+ self._metric = None + self._ragged = ragged + + def update_state(self, y_true, y_pred, sample_weight=None): + """Accumulates metric statistics. + + `y_true` and `y_pred` should have the same shape. + + Args: + y_true: The ground truth values. + y_pred: The predicted values. + sample_weight: Optional weighting of each example. Defaults to 1. Can be a + `Tensor` whose rank is either 0, or the same rank as `y_true`, and must + be broadcastable to `y_true`. + + Returns: + Update op. + """ + y_true = tf.cast(y_true, self._dtype) + y_pred = tf.cast(y_pred, self._dtype) + + # TODO: Add mask argument for metric.compute() call + per_list_metric_val, per_list_metric_weights = self._metric.compute(y_true, y_pred, sample_weight) + return super(_RankingMetric, self).update_state(per_list_metric_val, sample_weight=per_list_metric_weights) + + def get_config(self): + config = super(_RankingMetric, self).get_config() + config.update({ + "ragged": self._ragged, + }) + return config + + +def serialize_keras_object(obj): + if hasattr(tf.keras.utils, "legacy"): + return tf.keras.utils.legacy.serialize_keras_object(obj) + else: + return tf.keras.utils.serialize_keras_object(obj) + + +def deserialize_keras_object(config, module_objects=None, custom_objects=None, printable_module_name=None): + if hasattr(tf.keras.utils, "legacy"): + return tf.keras.utils.legacy.deserialize_keras_object(config, custom_objects, module_objects, printable_module_name) + else: + return tf.keras.utils.deserialize_keras_object(config, custom_objects, module_objects, printable_module_name) + + +# The following functions are used to transform labels and ranks for losses and +# metrics computation. User customized functions can be defined similarly by +# following the same annotations. +def identity(label: TensorLike) -> tf.Tensor: + """Identity function that returns the input label. + + Args: + label: A `Tensor` or anything that can be converted to a tensor using + `tf.convert_to_tensor`. + + Returns: + The input label. + """ + return label + + +def inverse(rank: TensorLike) -> tf.Tensor: + """Computes the inverse of input rank. + + Args: + rank: A `Tensor` or anything that can be converted to a tensor using + `tf.convert_to_tensor`. + + Returns: + A `Tensor` that has each input element transformed as `x` to `1/x`. + """ + return tf.math.divide_no_nan(1., rank) + + +def pow_minus_1(label: TensorLike) -> tf.Tensor: + """Computes `2**x - 1` element-wise for each label. + + Can be used to define `gain_fn` for `tfr.keras.metrics.NDCGMetric`. + + Args: + label: A `Tensor` or anything that can be converted to a tensor using + `tf.convert_to_tensor`. + + Returns: + A `Tensor` that has each input element transformed as `x` to `2**x - 1`. + """ + return tf.math.pow(2., label) - 1. + + +def log2_inverse(rank: TensorLike) -> tf.Tensor: + """Computes `1./log2(1+x)` element-wise for each label. + + Can be used to define `rank_discount_fn` for `tfr.keras.metrics.NDCGMetric`. + + Args: + rank: A `Tensor` or anything that can be converted to a tensor using + `tf.convert_to_tensor`. + + Returns: + A `Tensor` that has each input element transformed as `x` to `1./log2(1+x)`. + """ + return tf.math.divide_no_nan(tf.math.log(2.), tf.math.log1p(rank)) + + +def is_greater_equal_1(label: TensorLike) -> tf.Tensor: + """Computes whether label is greater or equal to 1. + + Args: + label: A `Tensor` or anything that can be converted to a tensor using + `tf.convert_to_tensor`. + + Returns: + A `Tensor` that has each input element transformed as `x` to `I(x > 1)`. 
+ """ + return tf.greater_equal(label, 1.0) + + +def symmetric_log1p(t: TensorLike) -> tf.Tensor: + """Computes `sign(x) * log(1 + sign(x))`. + + Args: + t: A `Tensor` or anything that can be converted to a tensor using + `tf.convert_to_tensor`. + + Returns: + A `Tensor` that has each input element transformed as `x` to `I(x > 1)`. + """ + return tf.math.log1p(t * tf.sign(t)) * tf.sign(t) diff --git a/deepray/metrics/alpha_dcg.py b/deepray/metrics/alpha_dcg.py new file mode 100644 index 00000000..ae1f2cde --- /dev/null +++ b/deepray/metrics/alpha_dcg.py @@ -0,0 +1,126 @@ +from ._ranking import _RankingMetric + + +class AlphaDCGMetric(_RankingMetric): + r"""Alpha discounted cumulative gain (alphaDCG). + + Alpha discounted cumulative gain ([Clarke et al, 2008][clarke2008]; + [Clarke et al, 2009][clarke2009]) is a cumulative gain metric that operates + on subtopics and is typically used for diversification tasks. + + For each list of scores `s` in `y_pred` and list of labels `y` in `y_true`: + + ``` + alphaDCG(y, s) = sum_t sum_i gain(y_{i,t}) * rank_discount(rank(s_i)) + gain(y_{i,t}) = (1 - alpha)^(sum_j I[rank(s_j) < rank(s_i)] * gain(y_{j,t})) + ``` + + NOTE: The labels `y_true` should be of shape + `[batch_size, list_size, subtopic_size]`, indicating relevance for each + subtopic in the last dimension. + + NOTE: The `rank_discount_fn` should be keras serializable. Please see + `tfr.keras.utils.log2_inverse` as an example when defining user customized + functions. + + Standalone usage: + + >>> y_true = [[[0., 1.], [1., 0.], [1., 1.]]] + >>> y_pred = [[3., 1., 2.]] + >>> alpha_dcg = tfr.keras.metrics.AlphaDCGMetric() + >>> alpha_dcg(y_true, y_pred).numpy() + 2.1963947 + + >>> # Using ragged tensors + >>> y_true = tf.ragged.constant( + ... [[[0., 0.], [1., 0.]], [[1., 1.], [0., 2.], [1., 0.]]]) + >>> y_pred = tf.ragged.constant([[2., 1.], [2., 5., 4.]]) + >>> alpha_dcg = tfr.keras.metrics.AlphaDCGMetric(ragged=True) + >>> alpha_dcg(y_true, y_pred).numpy() + 1.8184297 + + Usage with the `compile()` API: + + ```python + model.compile(optimizer='sgd', metrics=[tfr.keras.metrics.AlphaDCGMetric()]) + ``` + + Definition: + + $$ + \alpha\text{DCG}(y, s) = + \sum_t \sum_i \text{gain}(y_{i, t}, \alpha) + \text{ rank_discount}(\text{rank}(s_i))\\ + \text{gain}(y_{i, t}, \alpha) = + y_{i, t} (1 - \alpha)^{\sum_j I[\text{rank}(s_j) < \text{rank}(s_i)] y_{j, t}} + $$ + + where $\text{rank}(s_i)$ is the rank of item $i$ after sorting by scores + $s$ with ties broken randomly and $I[]$ is the indicator function: + + $$ + I[\text{cond}] = \begin{cases} + 1 & \text{if cond is true}\\ + 0 & \text{else}\end{cases} + $$ + + References: + + - [Novelty and diversity in information retrieval evaluation, Clarke et al, + 2008][clarke2008] + - [Overview of the TREC 2009 Web Track, Clarke et al, 2009][clarke2009] + + [clarke2008]: https://dl.acm.org/doi/10.1145/1390334.1390446 + [clarke2009]: https://trec.nist.gov/pubs/trec18/papers/ENT09.OVERVIEW.pdf + """ + + def __init__( + self, + name="alpha_dcg_metric", + topn=None, + alpha=0.5, + rank_discount_fn=None, + seed=None, + dtype=None, + ragged=False, + **kwargs + ): + """Construct the ranking metric class for alpha-DCG. + + Args: + name: A string used as the name for this metric. + topn: A cutoff for how many examples to consider for this metric. + alpha: A float between 0 and 1, parameter used in definition of alpha-DCG. + Introduced as an assessor error in judging whether a document is + covering a subtopic of the query. 
+ rank_discount_fn: A function of rank discounts. Default is set to
+ `1 / log2(rank+1)`. The `rank_discount_fn` should be keras serializable.
+ Please see `log2_inverse` in `_ranking.py` as an example when defining user
+ customized functions.
+ seed: The ops-level random seed used to shuffle ties in `sort_by_scores`.
+ dtype: Data type of the metric output. See `tf.keras.metrics.Metric`.
+ ragged: A bool indicating whether the supplied tensors are ragged. If
+ True, y_true, y_pred and sample_weight (if providing per-example weights)
+ need to be ragged tensors with compatible shapes.
+ **kwargs: Other keyword arguments used in `tf.keras.metrics.Metric`.
+ """
+ super(AlphaDCGMetric, self).__init__(name=name, dtype=dtype, ragged=ragged, **kwargs)
+ self._topn = topn
+ self._alpha = alpha
+ self._rank_discount_fn = rank_discount_fn or utils.log2_inverse
+ self._seed = seed
+ self._metric = metrics_impl.AlphaDCGMetric(
+ name=name, topn=topn, alpha=alpha, rank_discount_fn=self._rank_discount_fn, seed=seed, ragged=ragged
+ )
+
+ def get_config(self):
+ config = super(AlphaDCGMetric, self).get_config()
+ config.update(
+ {
+ "topn": self._topn,
+ "alpha": self._alpha,
+ "rank_discount_fn": self._rank_discount_fn,
+ "seed": self._seed,
+ }
+ )
+ return config
diff --git a/deepray/metrics/arp.py b/deepray/metrics/arp.py
new file mode 100644
index 00000000..122f7d25
--- /dev/null
+++ b/deepray/metrics/arp.py
@@ -0,0 +1,48 @@
+from ._ranking import _RankingMetric
+from deepray.metrics import metrics_impl
+
+
+class ARPMetric(_RankingMetric):
+ r"""Average relevance position (ARP).
+
+ For each list of scores `s` in `y_pred` and list of labels `y` in `y_true`:
+
+ ```
+ ARP(y, s) = sum_i (y_i * rank(s_i)) / sum_j y_j
+ ```
+
+ Standalone usage:
+
+ >>> y_true = [[0., 1., 1.]]
+ >>> y_pred = [[3., 1., 2.]]
+ >>> arp = tfr.keras.metrics.ARPMetric()
+ >>> arp(y_true, y_pred).numpy()
+ 2.5
+
+ >>> # Using ragged tensors
+ >>> y_true = tf.ragged.constant([[0., 1.], [1., 2., 0.]])
+ >>> y_pred = tf.ragged.constant([[2., 1.], [2., 5., 4.]])
+ >>> arp = tfr.keras.metrics.ARPMetric(ragged=True)
+ >>> arp(y_true, y_pred).numpy()
+ 1.75
+
+ Usage with the `compile()` API:
+
+ ```python
+ model.compile(optimizer='sgd', metrics=[tfr.keras.metrics.ARPMetric()])
+ ```
+
+ Definition:
+
+ $$
+ \text{ARP}(\{y\}, \{s\}) =
+ \frac{1}{\sum_i y_i} \sum_i y_i \cdot \text{rank}(s_i)
+ $$
+
+ where $\text{rank}(s_i)$ is the rank of item $i$ after sorting by scores
+ $s$ with ties broken randomly.
+ """
+
+ def __init__(self, name=None, dtype=None, ragged=False, **kwargs):
+ super(ARPMetric, self).__init__(name=name, dtype=dtype, ragged=ragged, **kwargs)
+ self._metric = metrics_impl.ARPMetric(name=name, ragged=ragged)
diff --git a/deepray/metrics/cohens_kappa.py b/deepray/metrics/cohens_kappa.py
index 72ddae2d..84f1c4db 100644
--- a/deepray/metrics/cohens_kappa.py
+++ b/deepray/metrics/cohens_kappa.py
@@ -16,8 +16,7 @@
 import tensorflow as tf
 import numpy as np
-import tensorflow.keras.backend as K
-from tensorflow.keras.metrics import Metric
+import tf_keras as keras
 from deepray.utils.types import AcceptableDTypes, FloatTensorLike
 from typeguard import typechecked
@@ -25,7 +24,7 @@
 @tf.keras.utils.register_keras_serializable(package="Deepray")
-class CohenKappa(Metric):
+class CohenKappa(keras.metrics.Metric):
 """Computes Kappa score between two raters.
 The score lies in the range `[-1, 1]`.
A score of -1 represents
@@ -256,7 +255,7 @@ def reset_state(self):
 """Resets all of the metric state variables."""
 for v in self.variables:
- K.set_value(
+ keras.backend.set_value(
 v,
 np.zeros((self.num_classes, self.num_classes), v.dtype.as_numpy_dtype),
 )
diff --git a/deepray/metrics/dcg.py b/deepray/metrics/dcg.py
new file mode 100644
index 00000000..05f770ee
--- /dev/null
+++ b/deepray/metrics/dcg.py
@@ -0,0 +1,76 @@
+from ._ranking import _RankingMetric
+from deepray.metrics import metrics_impl
+
+
+class DCGMetric(_RankingMetric):
+ r"""Discounted cumulative gain (DCG).
+
+ Discounted cumulative gain ([Järvelin et al, 2002][jarvelin2002]).
+
+ For each list of scores `s` in `y_pred` and list of labels `y` in `y_true`:
+
+ ```
+ DCG(y, s) = sum_i gain(y_i) * rank_discount(rank(s_i))
+ ```
+
+ NOTE: The `gain_fn` and `rank_discount_fn` should be keras serializable.
+ Please see `tfr.keras.utils.pow_minus_1` and `tfr.keras.utils.log2_inverse` as
+ examples when defining user customized functions.
+
+ Standalone usage:
+
+ >>> y_true = [[0., 1., 1.]]
+ >>> y_pred = [[3., 1., 2.]]
+ >>> dcg = tfr.keras.metrics.DCGMetric()
+ >>> dcg(y_true, y_pred).numpy()
+ 1.1309297
+
+ >>> # Using ragged tensors
+ >>> y_true = tf.ragged.constant([[0., 1.], [1., 2., 0.]])
+ >>> y_pred = tf.ragged.constant([[2., 1.], [2., 5., 4.]])
+ >>> dcg = tfr.keras.metrics.DCGMetric(ragged=True)
+ >>> dcg(y_true, y_pred).numpy()
+ 2.065465
+
+ Usage with the `compile()` API:
+
+ ```python
+ model.compile(optimizer='sgd', metrics=[tfr.keras.metrics.DCGMetric()])
+ ```
+
+ Definition:
+
+ $$
+ \text{DCG}(\{y\}, \{s\}) =
+ \sum_i \text{gain}(y_i) \cdot \text{rank_discount}(\text{rank}(s_i))
+ $$
+
+ where $\text{rank}(s_i)$ is the rank of item $i$ after sorting by scores
+ $s$ with ties broken randomly.
+
+ References:
+
+ - [Cumulated gain-based evaluation of IR techniques, Järvelin et al,
+ 2002][jarvelin2002]
+
+ [jarvelin2002]: https://dl.acm.org/doi/10.1145/582415.582418
+ """
+
+ def __init__(self, name=None, topn=None, gain_fn=None, rank_discount_fn=None, dtype=None, ragged=False, **kwargs):
+ super(DCGMetric, self).__init__(name=name, dtype=dtype, ragged=ragged, **kwargs)
+ self._topn = topn
+ self._gain_fn = gain_fn or utils.pow_minus_1
+ self._rank_discount_fn = rank_discount_fn or utils.log2_inverse
+ self._metric = metrics_impl.DCGMetric(
+ name=name, topn=topn, gain_fn=self._gain_fn, rank_discount_fn=self._rank_discount_fn, ragged=ragged
+ )
+
+ def get_config(self):
+ base_config = super(DCGMetric, self).get_config()
+ config = {
+ "topn": self._topn,
+ "gain_fn": self._gain_fn,
+ "rank_discount_fn": self._rank_discount_fn,
+ }
+ config.update(base_config)
+ return config
diff --git a/deepray/metrics/f_scores.py b/deepray/metrics/f_scores.py
index db96729b..c15afc23 100755
--- a/deepray/metrics/f_scores.py
+++ b/deepray/metrics/f_scores.py
@@ -15,15 +15,15 @@
 """Implements F scores."""
 import tensorflow as tf
-from tensorflow.keras import backend as K
+import tf_keras as keras
 from typeguard import typechecked
 from deepray.utils.types import AcceptableDTypes, FloatTensorLike
 from typing import Optional
-@tf.keras.utils.register_keras_serializable(package="Deepray")
-class FBetaScore(tf.keras.metrics.Metric):
+@keras.utils.register_keras_serializable(package="Deepray")
+class FBetaScore(keras.metrics.Metric):
 r"""Computes F-Beta score.
It is the weighted harmonic mean of precision @@ -191,7 +191,7 @@ def get_config(self): def reset_state(self): reset_value = tf.zeros(self.init_shape, dtype=self.dtype) - K.batch_set_value([(v, reset_value) for v in self.variables]) + keras.batch_set_value([(v, reset_value) for v in self.variables]) def reset_states(self): # Backwards compatibility alias of `reset_state`. New classes should diff --git a/deepray/metrics/geometric_mean.py b/deepray/metrics/geometric_mean.py index 4f5f698a..ee1081ec 100644 --- a/deepray/metrics/geometric_mean.py +++ b/deepray/metrics/geometric_mean.py @@ -15,16 +15,15 @@ """Implements GeometricMean.""" import tensorflow as tf -from tensorflow.keras import backend as K -from tensorflow.keras.metrics import Metric - +import tf_keras as keras +from tf_keras import backend as K from typeguard import typechecked from deepray.utils.types import AcceptableDTypes from deepray.metrics.utils import sample_weight_shape_match @tf.keras.utils.register_keras_serializable(package="Deepray") -class GeometricMean(Metric): +class GeometricMean(keras.metrics.Metric): """Compute Geometric Mean The geometric mean is a kind of mean. Unlike the arithmetic mean diff --git a/deepray/metrics/hits.py b/deepray/metrics/hits.py new file mode 100644 index 00000000..eabdb2c3 --- /dev/null +++ b/deepray/metrics/hits.py @@ -0,0 +1,65 @@ +from ._ranking import _RankingMetric + + +class HitsMetric(_RankingMetric): + r"""Hits@k metric. + + For each list of scores `s` in `y_pred` and list of labels `y` in `y_true`: + + ``` + Hits@k(y, s) = 1.0, if \exists i s.t. y_i >= 1 and rank(s_i) <= k + Hits@k(y, s) = 0.0, otherwise. + ``` + + NOTE: This metric converts graded relevance to binary relevance by setting + `y_i = 1` if `y_i >= 1` and `y_i = 0` if `y_i < 1`. + NOTE: While `topn` could be left as `None` without raising an error, the Hits + metric without `topn` specified would be trivial as it simply measures the + percentage of lists with at least 1 relevant item. + + Standalone usage: + + >>> y_true = [[0., 1., 1.]] + >>> y_pred = [[3., 1., 2.]] + >>> hits_at_1 = tfr.keras.metrics.HitsMetric(topn=1) + >>> hits_at_1(y_true, y_pred).numpy() + 0.0 + >>> hits_at_2 = tfr.keras.metrics.HitsMetric(topn=2) + >>> hits_at_2(y_true, y_pred).numpy() + 1.0 + + >>> # Using ragged tensors + >>> y_true = tf.ragged.constant([[0., 1.], [1., 1., 0.]]) + >>> y_pred = tf.ragged.constant([[2., 1.], [2., 5., 4.]]) + >>> hits_at_1 = tfr.keras.metrics.HitsMetric(topn=1, ragged=True) + >>> hits_at_1(y_true, y_pred).numpy() + 0.5 + + Usage with the `compile()` API: + + ```python + model.compile(optimizer='sgd', metrics=[tfr.keras.metrics.HitsMetric(topn=1)]) + ``` + + Definition: + + $$ + \text{Hits}@k(\{y\}, \{s\}) = \max_{i | y_i \geq 1} + \mathbf{I} [\text{rank}(s_i) \leq k] + $$ + + where $\text{rank}(s_i)$ is the rank of item $i$ after sorting by scores + $s$ with ties broken randomly and $y_i$ are labels. 
+ """ + + def __init__(self, name=None, topn=None, dtype=None, ragged=False, **kwargs): + super(HitsMetric, self).__init__(name=name, dtype=dtype, ragged=ragged, **kwargs) + self._topn = topn + self._metric = metrics_impl.HitsMetric(name=name, topn=topn, ragged=ragged) + + def get_config(self): + config = super(HitsMetric, self).get_config() + config.update({ + "topn": self._topn, + }) + return config diff --git a/deepray/metrics/matthews_correlation_coefficient.py b/deepray/metrics/matthews_correlation_coefficient.py index 6190144b..c871d7fe 100644 --- a/deepray/metrics/matthews_correlation_coefficient.py +++ b/deepray/metrics/matthews_correlation_coefficient.py @@ -17,7 +17,7 @@ import numpy as np import tensorflow as tf -from tensorflow.keras import backend as K +import tf_keras as keras from deepray.utils.types import AcceptableDTypes, FloatTensorLike from typeguard import typechecked @@ -125,7 +125,7 @@ def reset_state(self): """Resets all of the metric state variables.""" for v in self.variables: - K.set_value( + keras.set_value( v, np.zeros((self.num_classes, self.num_classes), v.dtype.as_numpy_dtype), ) diff --git a/deepray/metrics/mean_average_precision.py b/deepray/metrics/mean_average_precision.py new file mode 100644 index 00000000..76a5ecf0 --- /dev/null +++ b/deepray/metrics/mean_average_precision.py @@ -0,0 +1,79 @@ +from ._ranking import _RankingMetric + + +class MeanAveragePrecisionMetric(_RankingMetric): + r"""Mean average precision (MAP). + + For each list of scores `s` in `y_pred` and list of labels `y` in `y_true`: + + ``` + MAP(y, s) = sum_k (P@k(y, s) * rel(k)) / sum_i y_i + rel(k) = y_i if rank(s_i) = k + ``` + + NOTE: This metric converts graded relevance to binary relevance by setting + `y_i = 1` if `y_i >= 1`. + + Standalone usage: + + >>> y_true = [[0., 1., 1.]] + >>> y_pred = [[3., 1., 2.]] + >>> map_metric = tfr.keras.metrics.MeanAveragePrecisionMetric(topn=2) + >>> map_metric(y_true, y_pred).numpy() + 0.25 + + >>> # Using ragged tensors + >>> y_true = tf.ragged.constant([[0., 1.], [1., 2., 0.]]) + >>> y_pred = tf.ragged.constant([[2., 1.], [2., 5., 4.]]) + >>> map_metric = tfr.keras.metrics.MeanAveragePrecisionMetric( + ... topn=2, ragged=True) + >>> map_metric(y_true, y_pred).numpy() + 0.5 + + Usage with the `compile()` API: + + ```python + model.compile(optimizer='sgd', + metrics=[tfr.keras.metrics.MeanAveragePrecisionMetric()]) + ``` + + Definition: + + $$ + \text{MAP}(\{y\}, \{s\}) = + \frac{\sum_k P@k(y, s) \cdot \text{rel}(k)}{\sum_j \bar{y}_j} \\ + \text{rel}(k) = \max_i I[\text{rank}(s_i) = k] \bar{y}_i + $$ + + where: + + * $P@k(y, s)$ is the Precision at rank $k$. See + `tfr.keras.metrics.PrecisionMetric`. 
+ * $\text{rank}(s_i)$ is the rank of item $i$ after sorting by scores $s$ + with ties broken randomly + * $I[]$ is the indicator function:\ + $I[\text{cond}] = \begin{cases} + 1 & \text{if cond is true}\\ + 0 & \text{else}\end{cases} + $ + * $\bar{y}_i$ are the truncated labels:\ + $ + \bar{y}_i = \begin{cases} + 1 & \text{if }y_i \geq 1 \\ + 0 & \text{else} + \end{cases} + $ + """ + + def __init__(self, name=None, topn=None, dtype=None, ragged=False, **kwargs): + super(MeanAveragePrecisionMetric, self).__init__(name=name, dtype=dtype, ragged=ragged, **kwargs) + self._topn = topn + self._metric = metrics_impl.MeanAveragePrecisionMetric(name=name, topn=topn, ragged=ragged) + + def get_config(self): + base_config = super(MeanAveragePrecisionMetric, self).get_config() + config = { + "topn": self._topn, + } + config.update(base_config) + return config diff --git a/deepray/metrics/metrics_impl.py b/deepray/metrics/metrics_impl.py new file mode 100644 index 00000000..dffb469d --- /dev/null +++ b/deepray/metrics/metrics_impl.py @@ -0,0 +1,895 @@ +# Copyright 2024 The TensorFlow Ranking Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Implements the metrics for TF-Ranking. + +The test cases are mainly on metrics_test.py. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import abc +import functools +import six +import tensorflow as tf + +from deepray.metrics import utils + +_DEFAULT_GAIN_FN = lambda label: tf.pow(2.0, label) - 1 + +_DEFAULT_RANK_DISCOUNT_FN = lambda rank: tf.math.log(2.) / tf.math.log1p(rank) + + +def _alpha_dcg_gain_fn(labels, alpha): + """Computes gain for alpha DCG metric from sorted labels. + + Args: + labels: A `Tensor` with shape [batch_size, list_size, subtopic_size]. Each + value represents graded relevance to a subtopic: 1 for relevent subtopic, + 0 for irrelevant, and -1 for paddings. When the actual subtopic number of + a query is smaller than the `subtopic_size`, `labels` will be padded to + `subtopic_size` with -1, similar to the paddings used for queries with doc + number less then list_size. + alpha: A float between 0 and 1. Originally introduced as an assessor error + in judging whether a document is covering a subtopic of the query. It can + also be interpreted as the inverse number of documents covering the same + subtopic reader needs to get and confirm the subtopic information of a + query. + + Returns: + A function computes the alpha DCG gain. + """ + # Cumulative number of topics covered along the list_size dimension. + cum_subtopics = tf.cumsum(labels, axis=1, exclusive=True) + gains = tf.reduce_sum(tf.multiply(labels, tf.pow(1 - alpha, cum_subtopics)), axis=-1) + + return gains + + +def _per_example_weights_to_per_list_weights(weights, relevance): + """Computes per list weight from per example weight. + + The per-list weights are computed as: + per_list_weights = sum(weights * relevance) / sum(relevance). 
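+
+ For example, weights = [[1., 2., 3.]] with relevance = [[0., 1., 1.]] gives
+ a per-list weight of (2 + 3) / (1 + 1) = 2.5.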
+ + For a list with sum(relevance) = 0, we set a default weight as the following + average weight while all the lists with sum(weights) = 0 are ignored. + sum(per_list_weights) / num(sum(relevance) != 0 && sum(weights) != 0) + When all the lists have sum(relevance) == 0, we set the average weight to 1.0. + + Such a computation is good for the following scenarios: + - When all the weights are 1.0, the per list weights will be 1.0 everywhere, + even for lists without any relevant examples because + sum(per_list_weights) == num(sum(relevance) != 0) + This handles the standard ranking metrics where the weights are all 1.0. + - When every list has a nonzero weight, the default weight is not used. This + handles the unbiased metrics well. + - For the mixture of the above 2 scenario, the weights for lists with + nonzero relevance and nonzero weights is proportional to + per_list_weights / sum(per_list_weights) * + num(sum(relevance) != 0) / num(lists). + The rest have weights 1.0 / num(lists). + + Args: + weights: The weights `Tensor` of shape [batch_size, list_size]. + relevance: The relevance `Tensor` of shape [batch_size, list_size]. + + Returns: + The per list `Tensor` of shape [batch_size, 1] + """ + nonzero_weights = tf.greater(tf.reduce_sum(input_tensor=weights, axis=1, keepdims=True), 0.0) + per_list_relevance = tf.reduce_sum(input_tensor=relevance, axis=1, keepdims=True) + nonzero_relevance = tf.compat.v1.where( + nonzero_weights, tf.cast(tf.greater(per_list_relevance, 0.0), tf.float32), tf.zeros_like(per_list_relevance) + ) + nonzero_relevance_count = tf.reduce_sum(input_tensor=nonzero_relevance, axis=0, keepdims=True) + + per_list_weights = tf.compat.v1.math.divide_no_nan( + tf.reduce_sum(input_tensor=weights * relevance, axis=1, keepdims=True), per_list_relevance + ) + sum_weights = tf.reduce_sum(input_tensor=per_list_weights, axis=0, keepdims=True) + + avg_weight = tf.compat.v1.where( + tf.greater(nonzero_relevance_count, 0.0), tf.compat.v1.math.divide_no_nan(sum_weights, nonzero_relevance_count), + tf.ones_like(nonzero_relevance_count) + ) + return tf.compat.v1.where( + nonzero_weights, + tf.where(tf.greater(per_list_relevance, 0.0), per_list_weights, + tf.ones_like(per_list_weights) * avg_weight), tf.zeros_like(per_list_weights) + ) + + +def _discounted_cumulative_gain( + labels, weights=None, gain_fn=_DEFAULT_GAIN_FN, rank_discount_fn=_DEFAULT_RANK_DISCOUNT_FN +): + """Computes discounted cumulative gain (DCG). + + DCG = SUM(gain_fn(label) / rank_discount_fn(rank)). Using the default values + of the gain and discount functions, we get the following commonly used + formula for DCG: SUM((2^label -1) / log(1+rank)). + + Args: + labels: The relevance `Tensor` of shape [batch_size, list_size]. For the + ideal ranking, the examples are sorted by relevance in reverse order. In + alpha_dcg, it is a `Tensor` with shape [batch_size, list_size, + subtopic_size]. + weights: A `Tensor` of the same shape as labels or [batch_size, 1]. The + former case is per-example and the latter case is per-list. + gain_fn: (function) Transforms labels. + rank_discount_fn: (function) The rank discount function. + + Returns: + A `Tensor` as the weighted discounted cumulative gain per-list. The + tensor shape is [batch_size, 1]. 
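+
+ For example, with the default gain and rank discount, a single list with
+ labels = [[3., 1.]] already in ranked order and unit weights gives
+ DCG = 7 / log2(2) + 1 / log2(3), approximately 7.63.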
+ """ + list_size = tf.shape(input=labels)[1] + position = tf.cast(tf.range(1, list_size + 1), dtype=tf.float32) + gain = gain_fn(tf.cast(labels, dtype=tf.float32)) + discount = rank_discount_fn(position) + return tf.reduce_sum(input_tensor=weights * gain * discount, axis=1, keepdims=True) + + +def _per_list_recall(labels, predictions, topn, mask): + """Computes the recall@k for each query in the batch. + + Args: + labels: A `Tensor` of the same shape as `predictions`. A value >= 1 means a + relevant example. + predictions: A `Tensor` with shape [batch_size, list_size]. Each value is + the ranking score of the corresponding example. + topn: A cutoff for how many examples to consider for this metric. + mask: A mask indicating which entries are valid for computing the metric. + + Returns: + A `Tensor` of size [batch_size, 1] containing the recall of each query + respectively. + """ + sorted_labels = utils.sort_by_scores(predictions, [labels], topn=topn, mask=mask)[0] + topn_positives = tf.cast(tf.greater_equal(sorted_labels, 1.0), dtype=tf.float32) + labels = tf.cast(tf.greater_equal(labels, 1.0), dtype=tf.float32) + per_list_recall = tf.compat.v1.math.divide_no_nan( + tf.reduce_sum(input_tensor=topn_positives, axis=1, keepdims=True), + tf.reduce_sum(input_tensor=labels, axis=1, keepdims=True) + ) + return per_list_recall + + +def _per_list_precision(labels, predictions, topn, mask): + """Computes the precision for each query in the batch. + + Args: + labels: A `Tensor` of the same shape as `predictions`. A value >= 1 means a + relevant example. + predictions: A `Tensor` with shape [batch_size, list_size]. Each value is + the ranking score of the corresponding example. + topn: A cutoff for how many examples to consider for this metric. + mask: A `Tensor` of the same shape as predictions indicating which entries + are valid for computing the metric. + + Returns: + A `Tensor` of size [batch_size, 1] containing the precision of each query + respectively. + """ + sorted_labels = utils.sort_by_scores(predictions, [labels], topn=topn, mask=mask)[0] + # Relevance = 1.0 when labels >= 1.0. + relevance = tf.cast(tf.greater_equal(sorted_labels, 1.0), dtype=tf.float32) + if topn is None: + topn = tf.shape(relevance)[1] + valid_topn = tf.minimum(topn, tf.reduce_sum(tf.cast(mask, dtype=tf.int32), axis=1, keepdims=True)) + per_list_precision = tf.compat.v1.math.divide_no_nan( + tf.reduce_sum(input_tensor=relevance, axis=1, keepdims=True), tf.cast(valid_topn, dtype=tf.float32) + ) + return per_list_precision + + +class _RankingMetric(six.with_metaclass(abc.ABCMeta, object)): + """Interface for ranking metrics.""" + + def __init__(self, ragged=False): + """Constructor. + + Args: + ragged: A bool indicating whether the supplied tensors are ragged. If + True labels, predictions and weights (if providing per-example weights) + need to be ragged tensors with compatible shapes. + """ + self._ragged = ragged + + @abc.abstractproperty + def name(self): + """The metric name.""" + raise NotImplementedError('Calling an abstract method.') + + def _prepare_and_validate_params(self, labels, predictions, weights, mask): + """Prepares and validates the parameters. + + Args: + labels: A `Tensor` of the same shape as `predictions`. A value >= 1 means + a relevant example. + predictions: A `Tensor` with shape [batch_size, list_size]. Each value is + the ranking score of the corresponding example. + weights: A `Tensor` of the same shape of predictions or [batch_size, 1]. 
+ The former case is per-example and the latter case is per-list. + mask: A `Tensor` of the same shape as predictions indicating which entries + are valid for computing the metric. + + Returns: + (labels, predictions, weights, mask) ready to be used for metric + calculation. + """ + if any(isinstance(tensor, tf.RaggedTensor) for tensor in [labels, predictions, weights]): + raise ValueError( + 'labels, predictions and/or weights are ragged tensors, ' + 'use ragged=True to enable ragged support for metrics.' + ) + labels = tf.convert_to_tensor(value=labels) + predictions = tf.convert_to_tensor(value=predictions) + weights = 1.0 if weights is None else tf.convert_to_tensor(value=weights) + example_weights = tf.ones_like(labels) * weights + predictions.get_shape().assert_is_compatible_with(example_weights.get_shape()) + predictions.get_shape().assert_is_compatible_with(labels.get_shape()) + predictions.get_shape().assert_has_rank(2) + + # All labels should be >= 0. Invalid entries are reset. + if mask is None: + mask = utils.is_label_valid(labels) + mask = tf.math.logical_and(mask, tf.math.greater(example_weights, 0.0)) + labels = tf.compat.v1.where(mask, labels, tf.zeros_like(labels)) + predictions = tf.compat.v1.where( + mask, predictions, + -1e-6 * tf.ones_like(predictions) + tf.reduce_min(input_tensor=predictions, axis=1, keepdims=True) + ) + return labels, predictions, example_weights, mask + + def compute(self, labels, predictions, weights=None, mask=None): + """Computes the metric with the given inputs. + + Args: + labels: A `Tensor` of the same shape as `predictions` representing + relevance. + predictions: A `Tensor` with shape [batch_size, list_size]. Each value is + the ranking score of the corresponding example. + weights: An optional `Tensor` of the same shape of predictions or + [batch_size, 1]. The former case is per-example and the latter case is + per-list. + mask: An optional `Tensor` of the same shape as predictions indicating + which entries are valid for computing the metric. Will be ignored if + the metric was constructed with ragged=True. + + Returns: + A tf metric. + """ + if self._ragged: + labels, predictions, weights, mask = utils.ragged_to_dense(labels, predictions, weights) + labels, predictions, weights, mask = self._prepare_and_validate_params(labels, predictions, weights, mask) + return self._compute_impl(labels, predictions, weights, mask) + + @abc.abstractmethod + def _compute_impl(self, labels, predictions, weights, mask): + """Computes the metric with the given inputs. + + Args: + labels: A `Tensor` of the same shape as `predictions` representing + relevance. + predictions: A `Tensor` with shape [batch_size, list_size]. Each value is + the ranking score of the corresponding example. + weights: A `Tensor` of the same shape of predictions or [batch_size, 1]. + The former case is per-example and the latter case is per-list. + mask: A `Tensor` of the same shape as predictions indicating which entries + are valid for computing the metric. + + Returns: + A tf metric. + """ + raise NotImplementedError('Calling an abstract method.') + + +class _DivRankingMetric(_RankingMetric): + """Interface for diversity ranking metrics. + + Attributes: + name: A string used as the name for this metric. 
+ """ + + def __init__(self, name, topn=None, ragged=False): + super(_DivRankingMetric, self).__init__(ragged=ragged) + self._name = name + self._topn = topn + + @property + def name(self): + """The metric name.""" + return self._name + + @abc.abstractmethod + def _compute_per_list_metric(self, labels, predictions, weights, topn, mask): + """Computes the metric with the given inputs. + + Args: + labels: A `Tensor` with shape [batch_size, list_size, subtopic_size]. A + nonzero value means that the example covers the corresponding subtopic. + predictions: A `Tensor` with shape [batch_size, list_size]. Each value is + the ranking score of the corresponding example. + weights: A `Tensor` of the same shape of predictions or [batch_size, 1]. + The former case is per-example and the latter case is per-list. + topn: A cutoff for how many examples to consider for this metric. + mask: A `Tensor` of the same shape as predictions indicating which entries + are valid for computing the metric. + + Returns: + A tf per-list metric. + """ + + def _prepare_and_validate_params(self, labels, predictions, weights, mask): + """Prepares and validates the parameters. + + Args: + labels: A `Tensor` with shape [batch_size, list_size, subtopic_size]. A + nonzero value means that the example covers the corresponding subtopic. + predictions: A `Tensor` with shape [batch_size, list_size]. Each value is + the ranking score of the corresponding example. + weights: A `Tensor` of the same shape of predictions or [batch_size, 1]. + The former case is per-example and the latter case is per-list. + mask: A `Tensor` of the same shape as predictions indicating which entries + are valid for computing the metric. + + Returns: + A 4-tuple of (labels, predictions, weights, mask) ready to be used + for metric calculation. + """ + labels = tf.convert_to_tensor(value=labels) + predictions = tf.convert_to_tensor(value=predictions) + labels.get_shape().assert_has_rank(3) + if mask is None: + mask = utils.is_label_valid(labels) + mask = tf.convert_to_tensor(value=mask) + if mask.get_shape().rank == 3: + mask = tf.reduce_any(mask, axis=2) + predictions = tf.where( + mask, predictions, + -1e-6 * tf.ones_like(predictions) + tf.reduce_min(input_tensor=predictions, axis=1, keepdims=True) + ) + # All labels should be >= 0. Invalid entries are reset. + labels = tf.where(tf.expand_dims(mask, axis=2), labels, tf.zeros_like(labels)) + weights = (tf.constant(1.0, dtype=tf.float32) if weights is None else tf.convert_to_tensor(value=weights)) + example_weights = tf.ones_like(predictions) * weights + + return labels, predictions, example_weights, mask + + def _compute_per_list_weights(self, weights, labels): + """Computes per list weight from weights and labels for diversification. + + Args: + weights: The weights `Tensor` of shape [batch_size, list_size]. + labels: The labels `Tensor` of shape [batch_size, list_size, + subtopic_size]. + + Returns: + The per-list `Tensor` of shape [batch_size, 1] + """ + # per_list_weights are computed from the whole list to avoid the problem of + # 0 when there is no relevant example in topn. + return _per_example_weights_to_per_list_weights( + weights, tf.cast(tf.reduce_any(tf.greater_equal(labels, 1.0), axis=-1), dtype=tf.float32) + ) + + def _compute_impl(self, labels, predictions, weights, mask): + """Computes the metric and per list weight with the given inputs. + + Args: + labels: A `Tensor` with shape [batch_size, list_size, subtopic_size]. 
A + nonzero value means that the example covers the corresponding subtopic. + predictions: A `Tensor` with shape [batch_size, list_size]. Each value is + the ranking score of the corresponding example. + weights: A `Tensor` of the same shape of predictions or [batch_size, 1]. + The former case is per-example and the latter case is per-list. + mask: An optional `Tensor` of the same shape as predictions indicating + which entries are valid for computing the metric. + + Returns: + A per-list metric and a per-list weights. + """ + topn = tf.shape(input=predictions)[1] if self._topn is None else self._topn + per_list_metric = self._compute_per_list_metric(labels, predictions, weights, topn, mask) + per_list_weights = self._compute_per_list_weights(weights, labels) + return per_list_metric, per_list_weights + + +class MRRMetric(_RankingMetric): + """Implements mean reciprocal rank (MRR).""" + + def __init__(self, name, topn, ragged=False): + """Constructor.""" + super(MRRMetric, self).__init__(ragged=ragged) + self._name = name + self._topn = topn + + @property + def name(self): + """The metric name.""" + return self._name + + def _compute_impl(self, labels, predictions, weights, mask): + """See `_RankingMetric`.""" + topn = tf.shape(predictions)[1] if self._topn is None else self._topn + sorted_labels, = utils.sort_by_scores(predictions, [labels], topn=topn, mask=mask) + sorted_list_size = tf.shape(input=sorted_labels)[1] + # Relevance = 1.0 when labels >= 1.0 to accommodate graded relevance. + relevance = tf.cast(tf.greater_equal(sorted_labels, 1.0), dtype=tf.float32) + reciprocal_rank = 1.0 / tf.cast(tf.range(1, sorted_list_size + 1), dtype=tf.float32) + # MRR has a shape of [batch_size, 1]. + mrr = tf.reduce_max(input_tensor=relevance * reciprocal_rank, axis=1, keepdims=True) + per_list_weights = _per_example_weights_to_per_list_weights( + weights=weights, relevance=tf.cast(tf.greater_equal(labels, 1.0), dtype=tf.float32) + ) + return mrr, per_list_weights + + +class HitsMetric(_RankingMetric): + r"""Implements Hits@k metric. + + For each list of scores `s` in `y_pred` and list of labels `y` in `y_true`: + + ``` + Hits@k(y, s) = 1.0, if \exists i s.t. y_i >= 1 and rank(s_i) <= k + Hits@k(y, s) = 0.0, otherwise. + ``` + + NOTE: This metric converts graded relevance to binary relevance by setting + `y_i = 1` if `y_i >= 1` and `y_i = 0` if `y_i < 1`. + NOTE: While `topn` could be left as `None` without raising an error, the Hits + metric without `topn` specified would be trivial as it simply measures the + percentage of lists with at least 1 relevant item. + """ + + def __init__(self, name, topn, ragged=False): + """Constructor.""" + super(HitsMetric, self).__init__(ragged=ragged) + self._name = name + if topn is None: + tf.compat.v1.logging.warning( + 'Hits metric without `topn` specified could be trivial. ' + 'Consider specify `topn` for Hits metric.' + ) + self._topn = topn + + @property + def name(self): + """The metric name.""" + return self._name + + def _compute_impl(self, labels, predictions, weights, mask): + """See `_RankingMetric`.""" + topn = tf.shape(predictions)[1] if self._topn is None else self._topn + sorted_labels, = utils.sort_by_scores(predictions, [labels], topn=topn, mask=mask) + # Relevance = 1.0 when labels >= 1.0 to accommodate graded relevance. + relevance = tf.cast(tf.greater_equal(sorted_labels, 1.0), dtype=tf.float32) + # Hits has a shape of [batch_size, 1]. 
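+ # Taking the max over the top-n binary relevance yields 1.0 if any relevant
+ # item appears within the cutoff and 0.0 otherwise.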
+ hits = tf.reduce_max(input_tensor=relevance, axis=1, keepdims=True) + per_list_weights = _per_example_weights_to_per_list_weights( + weights=weights, relevance=tf.cast(tf.greater_equal(labels, 1.0), dtype=tf.float32) + ) + return hits, per_list_weights + + +class ARPMetric(_RankingMetric): + """Implements average relevance position (ARP).""" + + def __init__(self, name, ragged=False): + """Constructor.""" + super(ARPMetric, self).__init__(ragged=ragged) + self._name = name + + @property + def name(self): + """The metric name.""" + return self._name + + def _compute_impl(self, labels, predictions, weights, mask): + """See `_RankingMetric`.""" + topn = tf.shape(predictions)[1] + sorted_labels, sorted_weights = utils.sort_by_scores(predictions, [labels, weights], topn=topn, mask=mask) + weighted_labels = sorted_labels * sorted_weights + position = (tf.cast(tf.range(1, topn + 1), dtype=tf.float32) * tf.ones_like(weighted_labels)) + per_list_weights = tf.reduce_sum(weighted_labels, axis=1, keepdims=True) + per_list_arp = tf.compat.v1.div_no_nan( + tf.reduce_sum(position * weighted_labels, axis=1, keepdims=True), per_list_weights + ) + # TODO: Consider to add a cap position topn + 1 when there is no + # relevant examples. + return per_list_arp, per_list_weights + + +class RecallMetric(_RankingMetric): + """Implements recall@k (r@k).""" + + def __init__(self, name, topn, ragged=False): + """Constructor.""" + super(RecallMetric, self).__init__(ragged=ragged) + self._name = name + self._topn = topn + + @property + def name(self): + """The metric name.""" + return self._name + + def _compute_impl(self, labels, predictions, weights, mask): + """See `_RankingMetric`.""" + topn = tf.shape(predictions)[1] if self._topn is None else self._topn + per_list_recall = _per_list_recall(labels, predictions, topn, mask) + # per_list_weights are computed from the whole list to avoid the problem of + # 0 when there is no relevant example in topn. + per_list_weights = _per_example_weights_to_per_list_weights( + weights, tf.cast(tf.greater_equal(labels, 1.0), dtype=tf.float32) + ) + return per_list_recall, per_list_weights + + +class PrecisionMetric(_RankingMetric): + """Implements precision@k (P@k).""" + + def __init__(self, name, topn, ragged=False): + """Constructor.""" + super(PrecisionMetric, self).__init__(ragged=ragged) + self._name = name + self._topn = topn + + @property + def name(self): + """The metric name.""" + return self._name + + def _compute_impl(self, labels, predictions, weights, mask): + """See `_RankingMetric`.""" + topn = tf.shape(predictions)[1] if self._topn is None else self._topn + per_list_precision = _per_list_precision(labels, predictions, topn, mask) + # per_list_weights are computed from the whole list to avoid the problem of + # 0 when there is no relevant example in topn. 
+ per_list_weights = _per_example_weights_to_per_list_weights( + weights, tf.cast(tf.greater_equal(labels, 1.0), dtype=tf.float32) + ) + return per_list_precision, per_list_weights + + +class MeanAveragePrecisionMetric(_RankingMetric): + """Implements mean average precision (MAP).""" + + def __init__(self, name, topn, ragged=False): + """Constructor.""" + super(MeanAveragePrecisionMetric, self).__init__(ragged=ragged) + self._name = name + self._topn = topn + + @property + def name(self): + """The metric name.""" + return self._name + + def _compute_impl(self, labels, predictions, weights, mask): + """See `_RankingMetric`.""" + topn = tf.shape(predictions)[1] if self._topn is None else self._topn + # Relevance = 1.0 when labels >= 1.0. + relevance = tf.cast(tf.greater_equal(labels, 1.0), dtype=tf.float32) + sorted_relevance, sorted_weights = utils.sort_by_scores(predictions, [relevance, weights], topn=topn, mask=mask) + per_list_relevant_counts = tf.cumsum(sorted_relevance, axis=1) + per_list_cutoffs = tf.cumsum(tf.ones_like(sorted_relevance), axis=1) + per_list_precisions = tf.math.divide_no_nan(per_list_relevant_counts, per_list_cutoffs) + total_precision = tf.reduce_sum( + input_tensor=per_list_precisions * sorted_weights * sorted_relevance, axis=1, keepdims=True + ) + + # Compute the total relevance regardless of self._topn. + total_relevance = tf.reduce_sum(input_tensor=weights * relevance, axis=1, keepdims=True) + + per_list_map = tf.math.divide_no_nan(total_precision, total_relevance) + # per_list_weights are computed from the whole list to avoid the problem of + # 0 when there is no relevant example in topn. + per_list_weights = _per_example_weights_to_per_list_weights(weights, relevance) + return per_list_map, per_list_weights + + +class NDCGMetric(_RankingMetric): + """Implements normalized discounted cumulative gain (NDCG).""" + + def __init__(self, name, topn, gain_fn=_DEFAULT_GAIN_FN, rank_discount_fn=_DEFAULT_RANK_DISCOUNT_FN, ragged=False): + """Constructor.""" + super(NDCGMetric, self).__init__(ragged=ragged) + self._name = name + self._topn = topn + self._gain_fn = gain_fn + self._rank_discount_fn = rank_discount_fn + + @property + def name(self): + """The metric name.""" + return self._name + + def _compute_impl(self, labels, predictions, weights, mask): + """See `_RankingMetric`.""" + topn = tf.shape(predictions)[1] if self._topn is None else self._topn + sorted_labels, sorted_weights = utils.sort_by_scores(predictions, [labels, weights], topn=topn, mask=mask) + dcg = _discounted_cumulative_gain(sorted_labels, sorted_weights, self._gain_fn, self._rank_discount_fn) + # Sorting over the weighted gains to get ideal ranking. 
+ weighted_gains = weights * self._gain_fn(tf.cast(labels, dtype=tf.float32)) + ideal_sorted_labels, ideal_sorted_weights = utils.sort_by_scores( + weighted_gains, [labels, weights], topn=topn, mask=mask + ) + ideal_dcg = _discounted_cumulative_gain( + ideal_sorted_labels, ideal_sorted_weights, self._gain_fn, self._rank_discount_fn + ) + per_list_ndcg = tf.compat.v1.math.divide_no_nan(dcg, ideal_dcg) + per_list_weights = _per_example_weights_to_per_list_weights( + weights=weights, relevance=self._gain_fn(tf.cast(labels, dtype=tf.float32)) + ) + return per_list_ndcg, per_list_weights + + +class DCGMetric(_RankingMetric): + """Implements discounted cumulative gain (DCG).""" + + def __init__(self, name, topn, gain_fn=_DEFAULT_GAIN_FN, rank_discount_fn=_DEFAULT_RANK_DISCOUNT_FN, ragged=False): + """Constructor.""" + super(DCGMetric, self).__init__(ragged=ragged) + self._name = name + self._topn = topn + self._gain_fn = gain_fn + self._rank_discount_fn = rank_discount_fn + + @property + def name(self): + """The metric name.""" + return self._name + + def _compute_impl(self, labels, predictions, weights, mask): + """See `_RankingMetric`.""" + topn = tf.shape(predictions)[1] if self._topn is None else self._topn + sorted_labels, sorted_weights = utils.sort_by_scores(predictions, [labels, weights], topn=topn, mask=mask) + dcg = _discounted_cumulative_gain(sorted_labels, sorted_weights, self._gain_fn, self._rank_discount_fn) + per_list_weights = _per_example_weights_to_per_list_weights( + weights=weights, relevance=self._gain_fn(tf.cast(labels, dtype=tf.float32)) + ) + per_list_dcg = tf.compat.v1.math.divide_no_nan(dcg, per_list_weights) + return per_list_dcg, per_list_weights + + +class OPAMetric(_RankingMetric): + """Implements ordered pair accuracy (OPA).""" + + def __init__(self, name, ragged=False): + """Constructor.""" + super(OPAMetric, self).__init__(ragged=ragged) + self._name = name + + @property + def name(self): + """The metric name.""" + return self._name + + def _compute_impl(self, labels, predictions, weights, mask): + """See `_RankingMetric`.""" + valid_pair = tf.logical_and(tf.expand_dims(mask, 2), tf.expand_dims(mask, 1)) + pair_label_diff = tf.expand_dims(labels, 2) - tf.expand_dims(labels, 1) + pair_pred_diff = tf.expand_dims(predictions, 2) - tf.expand_dims(predictions, 1) + # Correct pairs are represented twice in the above pair difference tensors. + # We only take one copy for each pair. + correct_pairs = tf.cast(pair_label_diff > 0, dtype=tf.float32) * tf.cast(pair_pred_diff > 0, dtype=tf.float32) + pair_weights = tf.cast(pair_label_diff > 0, + dtype=tf.float32) * tf.expand_dims(weights, 2) * tf.cast(valid_pair, dtype=tf.float32) + per_list_weights = tf.expand_dims(tf.reduce_sum(pair_weights, axis=[1, 2]), 1) + per_list_opa = tf.compat.v1.math.divide_no_nan( + tf.expand_dims(tf.reduce_sum(correct_pairs * pair_weights, axis=[1, 2]), 1), per_list_weights + ) + return per_list_opa, per_list_weights + + +class PrecisionIAMetric(_DivRankingMetric): + """Implements Intent-Aware Precision@k (Pre-IA@k). + + PrecisionIA is a metric introduced in ["Overview of the TREC 2009 Web Track."] + by C Clarke, et al. It is one of the evaluation measures for the TREC + diversity task, where a query may have multiple different implications, termed + as subtopics / nuggets. Specifically, + Pre-IA@k = SUM_t SUM_{i=1}^k label(rank=i, topic=t) / (# of Subtopics * k), + where t indexes subtopics and i indexes document ranks, SUM_t sums over all + subtopics and SUM_{i=1}^k sums over the top k ranks. 
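+ As a small worked illustration (hypothetical numbers, not from the TREC overview):
+ with 2 subtopics and k = 2, if the two top-ranked documents carry subtopic labels
+ [1, 0] and [1, 1], then Pre-IA@2 = (1 + 1 + 0 + 1) / (2 * 2) = 0.75.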
+ """ + + def _compute_per_list_metric(self, labels, predictions, weights, topn, mask): + """See `_DivRankingMetric`.""" + sorted_labels = utils.sort_by_scores(predictions, [labels], topn=topn, mask=mask)[0] + # relevance shape = [batch_size, topn]. + relevance = tf.reduce_sum(tf.cast(tf.greater_equal(sorted_labels, 1.0), dtype=tf.float32), axis=-1) + # num_subtopics shape = [batch_size, 1]. + num_subtopics = tf.reduce_sum( + tf.cast(tf.reduce_any(tf.greater_equal(labels, 1.0), axis=1, keepdims=True), dtype=tf.float32), axis=-1 + ) + if topn is None: + topn = tf.shape(relevance)[1] + # valid_topn shape = [batch_size, 1]. + valid_topn = tf.minimum(topn, tf.reduce_sum(tf.cast(mask, dtype=tf.int32), axis=1, keepdims=True)) + return tf.compat.v1.math.divide_no_nan( + tf.reduce_sum(input_tensor=relevance, axis=1, keepdims=True), + tf.reduce_sum(input_tensor=tf.cast(valid_topn, dtype=tf.float32) * num_subtopics, axis=1, keepdims=True) + ) + + +class AlphaDCGMetric(_DivRankingMetric): + """Implements alpha discounted cumulative gain (alphaDCG). + + alphaDCG is a metric first introduced in ["Novelty and Diversity in + Information Retrieval Evaluation."] by C Clarke, et al. It is commonly used in + diversification tasks, where a query may have multiple different implications, + termed as subtopics / nuggets. This metric tends to emphasize a rank with + items covering different subtopics on top by a gain_fn with reduced gain from + readily covered subtopics. Specifically, + alphaDCG = SUM(gain_fn(label, alpha) / rank_discount_fn(rank)). + Using the default values of the gain and discount functions, we get the + following commonly used formula for alphaDCG: + SUM(label_i * (1-alpha)^(SUM_{rank_j R: i.e. When a lot of irrelevant documents ranked higher than the + relevant ones, the metric could be very positive. To use the latter formula, + set use_trec_version to False. + """ + + def __init__(self, name, topn, use_trec_version=True, ragged=False): + """Constructor.""" + super(BPrefMetric, self).__init__(ragged=ragged) + self._name = name + self._topn = topn + self._use_trec_version = use_trec_version + + @property + def name(self): + """The metric name.""" + return self._name + + def _compute_impl(self, labels, predictions, weights, mask): + """See `_RankingMetric`.""" + topn = tf.shape(predictions)[1] if self._topn is None else self._topn + + # Relevance = 1.0 when labels >= 1.0 to accommodate graded relevance. + relevance = tf.cast(tf.greater_equal(labels, 1.0), dtype=tf.float32) + irrelevance = tf.cast(mask, tf.float32) - relevance + + total_relevance = tf.reduce_sum(relevance, axis=1, keepdims=True) + total_irrelevance = tf.reduce_sum(irrelevance, axis=1, keepdims=True) + + sorted_relevance, sorted_irrelevance = utils.sort_by_scores( + predictions, [relevance, irrelevance], mask=mask, topn=topn + ) + + numerator = tf.minimum(tf.cumsum(sorted_irrelevance, axis=1), total_relevance) + denominator = tf.minimum(total_irrelevance, total_relevance) if self._use_trec_version else total_relevance + + bpref = tf.math.divide_no_nan( + tf.reduce_sum(((1. - tf.math.divide_no_nan(numerator, denominator)) * sorted_relevance), axis=1, keepdims=True), + total_relevance + ) + + per_list_weights = _per_example_weights_to_per_list_weights( + weights=weights, relevance=tf.cast(tf.greater_equal(relevance, 1.0), dtype=tf.float32) + ) + + return bpref, per_list_weights + + +class PWAMetric(_RankingMetric): + """Construct a custom Position-Weighted Average Metric. 
+ + For each query we order the results by scores and compute: + + pwa = (ratings[0] * position_weights[0] + ... + + ratings[topn - 1] * position_weights[topn - 1]) / + (position_weights[0] + ... + position_weights[topn - 1]) + + where position_weights = (1. / 1, 1. / 2, ..., 1. / topn) + + Metric value for the whole dataset is weighted sum over pwa values for + individual queries: + + result = pwa(query_0) * weights[0] + pwa(query_1) * weights[1] + ... + + For this metrcs, weights should be a `Tensor` of the shape [batch_size, 1]. + """ + + def __init__(self, name, topn=5, ragged=False): + """Constructor.""" + super().__init__(ragged=ragged) + self._name = name + self._topn = topn + + @property + def name(self): + """The metric name.""" + return self._name + + def compute(self, labels, predictions, weights=None, mask=None): + """See `_RankingMetric`.""" + if weights is not None: + weights_tensor = tf.convert_to_tensor(value=weights) + predictions_tensor = tf.convert_to_tensor(value=predictions) + expected_shape = tf.zeros([tf.shape(predictions_tensor)[0], 1]) + if not weights_tensor.shape.is_compatible_with(expected_shape.shape): + raise ValueError('Weights should be a `Tensor` of the shape' + '[batch_size, 1]') + return super().compute(labels, predictions, weights, mask) + + def _compute_impl(self, labels, predictions, weights, mask): + """See `_RankingMetric`.""" + topn = tf.shape(predictions)[1] if self._topn is None else self._topn + sorted_labels, sorted_mask = utils.sort_by_scores(predictions, [labels, mask], topn=topn, mask=mask) + + sorted_list_size = tf.shape(input=sorted_labels)[1] + position_weights = 1.0 / tf.cast(tf.range(1, sorted_list_size + 1), dtype=tf.float32) + masked_position_weights = (tf.cast(sorted_mask, dtype=tf.float32) * position_weights) + pwa = tf.compat.v1.math.divide_no_nan( + tf.reduce_sum(input_tensor=tf.multiply(sorted_labels, masked_position_weights), axis=1, keepdims=True), + tf.reduce_sum(input_tensor=masked_position_weights, axis=1, keepdims=True) + ) + # Weights list should come in with size [batch_size, 1], then will be + # expanded out to [batch_size, list_size] in the + # "_prepare_and_validate_params" step, so we need to reduce the Tensor back + # to size [batch_size, 1]. + per_list_weights = tf.reduce_mean(input_tensor=weights, axis=1, keepdims=True) + return pwa, per_list_weights diff --git a/deepray/metrics/mrr.py b/deepray/metrics/mrr.py new file mode 100644 index 00000000..82469bab --- /dev/null +++ b/deepray/metrics/mrr.py @@ -0,0 +1,111 @@ +import tensorflow as tf + +from deepray.metrics import metrics_impl + + +class _RankingMetric(tf.keras.metrics.Mean): + """Implements base ranking metric class. + + Please see tf.keras.metrics.Mean for more information about such a class and + https://www.tensorflow.org/tutorials/distribute/custom_training on how to do + customized training. + """ + + def __init__(self, name=None, dtype=None, ragged=False, **kwargs): + super(_RankingMetric, self).__init__(name=name, dtype=dtype, **kwargs) + # An instance of `metrics_impl._RankingMetric`. + # Overwrite this in subclasses. + self._metric = None + self._ragged = ragged + + def update_state(self, y_true, y_pred, sample_weight=None): + """Accumulates metric statistics. + + `y_true` and `y_pred` should have the same shape. + + Args: + y_true: The ground truth values. + y_pred: The predicted values. + sample_weight: Optional weighting of each example. Defaults to 1. 
Can be a + `Tensor` whose rank is either 0, or the same rank as `y_true`, and must + be broadcastable to `y_true`. + + Returns: + Update op. + """ + y_true = tf.cast(y_true, self._dtype) + y_pred = tf.cast(y_pred, self._dtype) + + # TODO: Add mask argument for metric.compute() call + per_list_metric_val, per_list_metric_weights = self._metric.compute(y_true, y_pred, sample_weight) + return super(_RankingMetric, self).update_state(per_list_metric_val, sample_weight=per_list_metric_weights) + + def get_config(self): + config = super(_RankingMetric, self).get_config() + config.update({ + "ragged": self._ragged, + }) + return config + + +class MRRMetric(_RankingMetric): + r"""Mean reciprocal rank (MRR). + + For each list of scores `s` in `y_pred` and list of labels `y` in `y_true`: + + ``` + MRR(y, s) = max_i y_i / rank(s_i) + ``` + + NOTE: This metric converts graded relevance to binary relevance by setting + `y_i = 1` if `y_i >= 1`. + + Standalone usage: + + >>> y_true = [[0., 1., 1.]] + >>> y_pred = [[3., 1., 2.]] + >>> mrr = dp.metrics.MRRMetric() + >>> mrr(y_true, y_pred).numpy() + 0.5 + + >>> # Using ragged tensors + >>> y_true = tf.ragged.constant([[0., 1.], [1., 2., 0.]]) + >>> y_pred = tf.ragged.constant([[2., 1.], [2., 5., 4.]]) + >>> mrr = dp.metrics.MRRMetric(ragged=True) + >>> mrr(y_true, y_pred).numpy() + 0.75 + + Usage with the `compile()` API: + + ```python + model.compile(optimizer='sgd', metrics=[tfr.keras.metrics.MRRMetric()]) + ``` + + Definition: + + $$ + \text{MRR}(\{y\}, \{s\}) = \max_i \frac{\bar{y}_i}{\text{rank}(s_i)} + $$ + + where $\text{rank}(s_i)$ is the rank of item $i$ after sorting by scores + $s$ with ties broken randomly and $\bar{y_i}$ are truncated labels: + + $$ + \bar{y}_i = \begin{cases} + 1 & \text{if }y_i \geq 1 \\ + 0 & \text{else} + \end{cases} + $$ + """ + + def __init__(self, name=None, topn=None, dtype=None, ragged=False, **kwargs): + super(MRRMetric, self).__init__(name=name, dtype=dtype, ragged=ragged, **kwargs) + self._topn = topn + self._metric = metrics_impl.MRRMetric(name=name, topn=topn, ragged=ragged) + + def get_config(self): + config = super(MRRMetric, self).get_config() + config.update({ + "topn": self._topn, + }) + return config diff --git a/deepray/metrics/multilabel_confusion_matrix.py b/deepray/metrics/multilabel_confusion_matrix.py index 281deaa5..1bbe5bfb 100644 --- a/deepray/metrics/multilabel_confusion_matrix.py +++ b/deepray/metrics/multilabel_confusion_matrix.py @@ -17,7 +17,7 @@ import warnings import tensorflow as tf -from tensorflow.keras import backend as K +from tf_keras import backend as K from tensorflow.keras.metrics import Metric import numpy as np diff --git a/deepray/metrics/ndcg.py b/deepray/metrics/ndcg.py new file mode 100644 index 00000000..b441df19 --- /dev/null +++ b/deepray/metrics/ndcg.py @@ -0,0 +1,131 @@ +import tensorflow as tf +from deepray.metrics import metrics_impl +from deepray.metrics import utils + +_DEFAULT_GAIN_FN = lambda label: tf.pow(2.0, label) - 1 + +_DEFAULT_RANK_DISCOUNT_FN = lambda rank: tf.math.log(2.) / tf.math.log1p(rank) + + +class _RankingMetric(tf.keras.metrics.Mean): + """Implements base ranking metric class. + + Please see tf.keras.metrics.Mean for more information about such a class and + https://www.tensorflow.org/tutorials/distribute/custom_training on how to do + customized training. 
+ """ + + def __init__(self, name=None, dtype=None, ragged=False, **kwargs): + super(_RankingMetric, self).__init__(name=name, dtype=dtype, **kwargs) + # An instance of `metrics_impl._RankingMetric`. + # Overwrite this in subclasses. + self._metric = None + self._ragged = ragged + + def update_state(self, y_true, y_pred, sample_weight=None): + """Accumulates metric statistics. + + `y_true` and `y_pred` should have the same shape. + + Args: + y_true: The ground truth values. + y_pred: The predicted values. + sample_weight: Optional weighting of each example. Defaults to 1. Can be a + `Tensor` whose rank is either 0, or the same rank as `y_true`, and must + be broadcastable to `y_true`. + + Returns: + Update op. + """ + y_true = tf.cast(y_true, self._dtype) + y_pred = tf.cast(y_pred, self._dtype) + + # TODO: Add mask argument for metric.compute() call + per_list_metric_val, per_list_metric_weights = self._metric.compute(y_true, y_pred, sample_weight) + return super(_RankingMetric, self).update_state(per_list_metric_val, sample_weight=per_list_metric_weights) + + def get_config(self): + config = super(_RankingMetric, self).get_config() + config.update({ + "ragged": self._ragged, + }) + return config + + +@tf.keras.utils.register_keras_serializable(package="tensorflow_ranking") +class NDCGMetric(_RankingMetric): + r"""Normalized discounted cumulative gain (NDCG). + + Normalized discounted cumulative gain ([Järvelin et al, 2002][jarvelin2002]) + is the normalized version of `tfr.keras.metrics.DCGMetric`. + + For each list of scores `s` in `y_pred` and list of labels `y` in `y_true`: + + ``` + NDCG(y, s) = DCG(y, s) / DCG(y, y) + DCG(y, s) = sum_i gain(y_i) * rank_discount(rank(s_i)) + ``` + + NOTE: The `gain_fn` and `rank_discount_fn` should be keras serializable. + Please see `tfr.keras.utils.pow_minus_1` and `tfr.keras.utils.log2_inverse` as + examples when defining user customized functions. + + Standalone usage: + + >>> y_true = [[0., 1., 1.]] + >>> y_pred = [[3., 1., 2.]] + >>> ndcg = dp.metrics.NDCGMetric() + >>> ndcg(y_true, y_pred).numpy() + 0.6934264 + + >>> # Using ragged tensors + >>> y_true = tf.ragged.constant([[0., 1.], [1., 2., 0.]]) + >>> y_pred = tf.ragged.constant([[2., 1.], [2., 5., 4.]]) + >>> ndcg = dp.metrics.NDCGMetric(ragged=True) + >>> ndcg(y_true, y_pred).numpy() + 0.7974351 + + Usage with the `compile()` API: + + ```python + model.compile(optimizer='sgd', metrics=[tfr.keras.metrics.NDCGMetric()]) + ``` + + Definition: + + $$ + \text{NDCG}(\{y\}, \{s\}) = + \frac{\text{DCG}(\{y\}, \{s\})}{\text{DCG}(\{y\}, \{y\})} \\ + \text{DCG}(\{y\}, \{s\}) = + \sum_i \text{gain}(y_i) \cdot \text{rank_discount}(\text{rank}(s_i)) + $$ + + where $\text{rank}(s_i)$ is the rank of item $i$ after sorting by scores + $s$ with ties broken randomly. 
+ + References: + + - [Cumulated gain-based evaluation of IR techniques, Järvelin et al, + 2002][jarvelin2002] + + [jarvelin2002]: https://dl.acm.org/doi/10.1145/582415.582418 + """ + + def __init__(self, name=None, topn=None, gain_fn=None, rank_discount_fn=None, dtype=None, ragged=False, **kwargs): + super(NDCGMetric, self).__init__(name=name, dtype=dtype, ragged=ragged, **kwargs) + self._topn = topn + self._gain_fn = gain_fn or utils.pow_minus_1 + self._rank_discount_fn = rank_discount_fn or utils.log2_inverse + self._metric = metrics_impl.NDCGMetric( + name=name, topn=topn, gain_fn=self._gain_fn, rank_discount_fn=self._rank_discount_fn, ragged=ragged + ) + + def get_config(self): + base_config = super(NDCGMetric, self).get_config() + config = { + "topn": self._topn, + "gain_fn": self._gain_fn, + "rank_discount_fn": self._rank_discount_fn, + } + config.update(base_config) + return config diff --git a/deepray/metrics/opa.py b/deepray/metrics/opa.py new file mode 100644 index 00000000..1a1beaa8 --- /dev/null +++ b/deepray/metrics/opa.py @@ -0,0 +1,55 @@ +from ._ranking import _RankingMetric + + +class OPAMetric(_RankingMetric): + r"""Ordered pair accuracy (OPA). + + For each list of scores `s` in `y_pred` and list of labels `y` in `y_true`: + + ``` + OPA(y, s) = sum_i sum_j I[s_i > s_j] I[y_i > y_j] / sum_i sum_j I[y_i > y_j] + ``` + + NOTE: Pairs with equal labels (`y_i = y_j`) are always ignored. Pairs with + equal scores (`s_i = s_j`) are considered incorrectly ordered. + + Standalone usage: + + >>> y_true = [[0., 1., 2.]] + >>> y_pred = [[3., 1., 2.]] + >>> opa = tfr.keras.metrics.OPAMetric() + >>> opa(y_true, y_pred).numpy() + 0.33333334 + + >>> # Using ragged tensors + >>> y_true = tf.ragged.constant([[0., 1.], [1., 2., 0.]]) + >>> y_pred = tf.ragged.constant([[2., 1.], [2., 5., 4.]]) + >>> opa = tfr.keras.metrics.OPAMetric(ragged=True) + >>> opa(y_true, y_pred).numpy() + 0.5 + + Usage with the `compile()` API: + + ```python + model.compile(optimizer='sgd', metrics=[tfr.keras.metrics.OPAMetric()]) + ``` + + Definition: + + $$ + \text{OPA}(\{y\}, \{s\}) = + \frac{\sum_i \sum_j I[s_i > s_j] I[y_i > y_j]}{\sum_i \sum_j I[y_i > y_j]} + $$ + + where $I[]$ is the indicator function: + + $$ + I[\text{cond}] = \begin{cases} + 1 & \text{if cond is true}\\ + 0 & \text{else}\end{cases} + $$ + """ + + def __init__(self, name=None, dtype=None, ragged=False, **kwargs): + super(OPAMetric, self).__init__(name=name, dtype=dtype, ragged=ragged, **kwargs) + self._metric = metrics_impl.OPAMetric(name=name, ragged=ragged) diff --git a/deepray/metrics/precision.py b/deepray/metrics/precision.py new file mode 100644 index 00000000..54184f33 --- /dev/null +++ b/deepray/metrics/precision.py @@ -0,0 +1,73 @@ +from ._ranking import _RankingMetric + + +class PrecisionMetric(_RankingMetric): + r"""Precision@k (P@k). + + For each list of scores `s` in `y_pred` and list of labels `y` in `y_true`: + + ``` + P@K(y, s) = 1/k sum_i I[rank(s_i) < k] y_i + ``` + + NOTE: This metric converts graded relevance to binary relevance by setting + `y_i = 1` if `y_i >= 1`. 
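+ For instance, with labels [0., 1., 1.] and scores [3., 1., 2.], exactly one of the two
+ highest-scoring items is relevant, so P@2 = 1/2, as the standalone usage below shows.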
+ + Standalone usage: + + >>> y_true = [[0., 1., 1.]] + >>> y_pred = [[3., 1., 2.]] + >>> precision_at_2 = tfr.keras.metrics.PrecisionMetric(topn=2) + >>> precision_at_2(y_true, y_pred).numpy() + 0.5 + + >>> # Using ragged tensors + >>> y_true = tf.ragged.constant([[0., 1.], [1., 2., 0.]]) + >>> y_pred = tf.ragged.constant([[2., 1.], [2., 5., 4.]]) + >>> precision_at_2 = tfr.keras.metrics.PrecisionMetric(topn=2, ragged=True) + >>> precision_at_2(y_true, y_pred).numpy() + 0.5 + + Usage with the `compile()` API: + + ```python + model.compile(optimizer='sgd', metrics=[tfr.keras.metrics.PrecisionMetric()]) + ``` + + Definition: + + $$ + \text{P@k}(\{y\}, \{s\}) = + \frac{1}{k} \sum_i I[\text{rank}(s_i) \leq k] \bar{y}_i + $$ + + where: + + * $\text{rank}(s_i)$ is the rank of item $i$ after sorting by scores $s$ + with ties broken randomly + * $I[]$ is the indicator function:\ + $I[\text{cond}] = \begin{cases} + 1 & \text{if cond is true}\\ + 0 & \text{else}\end{cases} + $ + * $\bar{y}_i$ are the truncated labels:\ + $ + \bar{y}_i = \begin{cases} + 1 & \text{if }y_i \geq 1 \\ + 0 & \text{else} + \end{cases} + $ + * $k = |y|$ if $k$ is not provided + """ + + def __init__(self, name=None, topn=None, dtype=None, ragged=False, **kwargs): + super(PrecisionMetric, self).__init__(name=name, dtype=dtype, ragged=ragged, **kwargs) + self._topn = topn + self._metric = metrics_impl.PrecisionMetric(name=name, topn=topn, ragged=ragged) + + def get_config(self): + config = super(PrecisionMetric, self).get_config() + config.update({ + "topn": self._topn, + }) + return config diff --git a/deepray/metrics/precision_ia.py b/deepray/metrics/precision_ia.py new file mode 100644 index 00000000..1b4d17a5 --- /dev/null +++ b/deepray/metrics/precision_ia.py @@ -0,0 +1,88 @@ +from ._ranking import _RankingMetric + + +class PrecisionIAMetric(_RankingMetric): + r"""Precision-IA@k (Pre-IA@k). + + Intent-aware Precision@k ([Agrawal et al, 2009][agrawal2009]; + [Clarke et al, 2009][clarke2009]) is a precision metric that operates on + subtopics and is typically used for diversification tasks.. + + For each list of scores `s` in `y_pred` and list of labels `y` in `y_true`: + + ``` + Pre-IA@k(y, s) = sum_t sum_i I[rank(s_i) <= k] y_{i,t} / (# of subtopics * k) + ``` + + NOTE: The labels `y_true` should be of shape + `[batch_size, list_size, subtopic_size]`, indicating relevance for each + subtopic in the last dimension. + + NOTE: This metric converts graded relevance to binary relevance by setting + `y_{i,t} = 1` if `y_{i,t} >= 1`. + + Standalone usage: + + >>> y_true = [[[0., 1.], [1., 0.], [1., 1.]]] + >>> y_pred = [[3., 1., 2.]] + >>> pre_ia = tfr.keras.metrics.PrecisionIAMetric() + >>> pre_ia(y_true, y_pred).numpy() + 0.6666667 + + >>> # Using ragged tensors + >>> y_true = tf.ragged.constant( + ... [[[0., 0.], [1., 0.]], [[1., 1.], [0., 2.], [1., 0.]]]) + >>> y_pred = tf.ragged.constant([[2., 1.], [2., 5., 4.]]) + >>> pre_ia = tfr.keras.metrics.PrecisionIAMetric(ragged=True) + >>> pre_ia(y_true, y_pred).numpy() + 0.5833334 + + Usage with the `compile()` API: + + ```python + model.compile(optimizer='sgd', + metrics=[tfr.keras.metrics.PrecisionIAMetric()]) + ``` + + Definition: + + $$ + \text{Pre-IA@k}(y, s) = \frac{1}{\text{# of subtopics} \cdot k} + \sum_t \sum_i I[\text{rank}(s_i) \leq k] y_{i,t} + $$ + + where $\text{rank}(s_i)$ is the rank of item $i$ after sorting by scores + $s$ with ties broken randomly. 
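+ As a worked check of the first standalone example above: no cutoff is given, so k equals
+ the list size 3; each of the 2 subtopics has a label sum of 2 over the three documents,
+ giving Pre-IA = (2 + 2) / (2 * 3) ≈ 0.6667.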
+ + References: + + - [Diversifying Search Results, Agrawal et al, 2009][agrawal2009] + - [Overview of the TREC 2009 Web Track, Clarke et al, 2009][clarke2009] + + [agrawal2009]: + https://www.microsoft.com/en-us/research/publication/diversifying-search-results/ + [clarke2009]: https://trec.nist.gov/pubs/trec18/papers/ENT09.OVERVIEW.pdf + """ + + def __init__(self, name=None, topn=None, dtype=None, ragged=False, **kwargs): + """Constructor. + + Args: + name: A string used as the name for this metric. + topn: A cutoff for how many examples to consider for this metric. + dtype: Data type of the metric output. See `tf.keras.metrics.Metric`. + ragged: A bool indicating whether the supplied tensors are ragged. If + True y_true, y_pred and sample_weight (if providing per-example weights) + need to be ragged tensors with compatible shapes. + **kwargs: Other keyward arguments used in `tf.keras.metrics.Metric`. + """ + super(PrecisionIAMetric, self).__init__(name=name, dtype=dtype, ragged=ragged, **kwargs) + self._topn = topn + self._metric = metrics_impl.PrecisionIAMetric(name=name, topn=topn, ragged=ragged) + + def get_config(self): + config = super(PrecisionIAMetric, self).get_config() + config.update({ + "topn": self._topn, + }) + return config diff --git a/deepray/metrics/r_square.py b/deepray/metrics/r_square.py index e3261de3..32585589 100644 --- a/deepray/metrics/r_square.py +++ b/deepray/metrics/r_square.py @@ -17,7 +17,7 @@ import numpy as np import tensorflow as tf -from tensorflow.keras import backend as K +from tf_keras import backend as K from tensorflow.keras.metrics import Metric from tensorflow.python.ops import weights_broadcast_ops diff --git a/deepray/metrics/recall.py b/deepray/metrics/recall.py new file mode 100644 index 00000000..715ff072 --- /dev/null +++ b/deepray/metrics/recall.py @@ -0,0 +1,73 @@ +from ._ranking import _RankingMetric + + +class RecallMetric(_RankingMetric): + r"""Recall@k (R@k). + + For each list of scores `s` in `y_pred` and list of labels `y` in `y_true`: + + ``` + R@K(y, s) = sum_i I[rank(s_i) < k] y_i / sum_j y_j + ``` + + NOTE: This metric converts graded relevance to binary relevance by setting + `y_i = 1` if `y_i >= 1`. 
+ + Standalone usage: + + >>> y_true = [[0., 1., 1.]] + >>> y_pred = [[3., 1., 2.]] + >>> recall_at_2 = tfr.keras.metrics.RecallMetric(topn=2) + >>> recall_at_2(y_true, y_pred).numpy() + 0.5 + + >>> # Using ragged tensors + >>> y_true = tf.ragged.constant([[0., 1.], [1., 2., 0.]]) + >>> y_pred = tf.ragged.constant([[2., 1.], [2., 5., 4.]]) + >>> recall_at_2 = tfr.keras.metrics.RecallMetric(topn=2, ragged=True) + >>> recall_at_2(y_true, y_pred).numpy() + 0.75 + + Usage with the `compile()` API: + + ```python + model.compile(optimizer='sgd', metrics=[tfr.keras.metrics.RecallMetric()]) + ``` + + Definition: + + $$ + \text{R@k}(\{y\}, \{s\}) = + \frac{\sum_i I[\text{rank}(s_i) \leq k] \bar{y}_i}{\sum_j \bar{y}_j} + $$ + + where: + + * $\text{rank}(s_i)$ is the rank of item $i$ after sorting by scores $s$ + with ties broken randomly + * $I[]$ is the indicator function:\ + $I[\text{cond}] = \begin{cases} + 1 & \text{if cond is true}\\ + 0 & \text{else}\end{cases} + $ + * $\bar{y}_i$ are the truncated labels:\ + $ + \bar{y}_i = \begin{cases} + 1 & \text{if }y_i \geq 1 \\ + 0 & \text{else} + \end{cases} + $ + * $k = |y|$ if $k$ is not provided + """ + + def __init__(self, name=None, topn=None, dtype=None, ragged=False, **kwargs): + super(RecallMetric, self).__init__(name=name, dtype=dtype, ragged=ragged, **kwargs) + self._topn = topn + self._metric = metrics_impl.RecallMetric(name=name, topn=topn, ragged=ragged) + + def get_config(self): + config = super(RecallMetric, self).get_config() + config.update({ + "topn": self._topn, + }) + return config diff --git a/deepray/metrics/streaming_correlations.py b/deepray/metrics/streaming_correlations.py index fc66c19e..44354cdb 100644 --- a/deepray/metrics/streaming_correlations.py +++ b/deepray/metrics/streaming_correlations.py @@ -18,13 +18,13 @@ import numpy as np import tensorflow as tf -from tensorflow.keras import backend -from tensorflow.keras.metrics import Metric +import tf_keras as keras +from tf_keras import backend from deepray.utils.types import AcceptableDTypes from typeguard import typechecked -class CorrelationBase(Metric): +class CorrelationBase(keras.metrics.Metric): """Base class for streaming correlation metrics. Based on https://arxiv.org/abs/1712.01521. diff --git a/deepray/metrics/utils.py b/deepray/metrics/utils.py index 785cc668..225b9689 100644 --- a/deepray/metrics/utils.py +++ b/deepray/metrics/utils.py @@ -14,12 +14,23 @@ # ============================================================================== """Utilities for metrics.""" +from typing import Callable +from typing import Optional + import numpy as np import tensorflow as tf +from typeguard import typechecked + from deepray.utils.types import AcceptableDTypes -from typeguard import typechecked -from typing import Optional, Callable +_PADDING_LABEL = -1. +_PADDING_PREDICTION = -1e6 +_PADDING_WEIGHT = 0. + +TensorLike = tf.types.experimental.TensorLike +GainFunction = Callable[[TensorLike], tf.Tensor] +RankDiscountFunction = Callable[[TensorLike], tf.Tensor] +PositiveFunction = Callable[[TensorLike], tf.Tensor] class MeanMetricWrapper(tf.keras.metrics.Mean): @@ -90,3 +101,141 @@ def sample_weight_shape_match(v, sample_weight): if np.size(sample_weight) == 1: return tf.fill(v.shape, sample_weight) return tf.convert_to_tensor(sample_weight) + + +def pow_minus_1(label: TensorLike) -> tf.Tensor: + """Computes `2**x - 1` element-wise for each label. + + Can be used to define `gain_fn` for `tfr.keras.metrics.NDCGMetric`. 
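+ For example, labels `[0., 1., 2.]` map to gains `[0., 1., 3.]`.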
+ + Args: + label: A `Tensor` or anything that can be converted to a tensor using + `tf.convert_to_tensor`. + + Returns: + A `Tensor` that has each input element transformed as `x` to `2**x - 1`. + """ + return tf.math.pow(2., label) - 1. + + +def log2_inverse(rank: TensorLike) -> tf.Tensor: + """Computes `1./log2(1+x)` element-wise for each label. + + Can be used to define `rank_discount_fn` for `tfr.keras.metrics.NDCGMetric`. + + Args: + rank: A `Tensor` or anything that can be converted to a tensor using + `tf.convert_to_tensor`. + + Returns: + A `Tensor` that has each input element transformed as `x` to `1./log2(1+x)`. + """ + return tf.math.divide_no_nan(tf.math.log(2.), tf.math.log1p(rank)) + + +def is_label_valid(labels): + """Returns a boolean `Tensor` for label validity.""" + labels = tf.convert_to_tensor(value=labels) + return tf.greater_equal(labels, 0.) + + +def _get_shuffle_indices(shape, mask=None, shuffle_ties=True, seed=None): + """Gets indices which would shuffle a tensor. + + Args: + shape: The shape of the indices to generate. + mask: An optional mask that indicates which entries to place first. Its + shape should be equal to given shape. + shuffle_ties: Whether to randomly shuffle ties. + seed: The ops-level random seed. + + Returns: + An int32 `Tensor` with given `shape`. Its entries are indices that would + (randomly) shuffle the values of a `Tensor` of given `shape` along the last + axis while placing masked items first. + """ + # Generate random values when shuffling ties or all zeros when not. + if shuffle_ties: + shuffle_values = tf.random.uniform(shape, seed=seed) + else: + shuffle_values = tf.zeros(shape, dtype=tf.float32) + + # Since shuffle_values is always in [0, 1), we can safely increase entries + # where mask=False with 2.0 to make sure those are placed last during the + # argsort op. + if mask is not None: + shuffle_values = tf.where(mask, shuffle_values, shuffle_values + 2.0) + + # Generate indices by sorting the shuffle values. + return tf.argsort(shuffle_values, stable=True) + + +def sort_by_scores(scores, features_list, topn=None, shuffle_ties=True, seed=None, mask=None): + """Sorts list of features according to per-example scores. + + Args: + scores: A `Tensor` of shape [batch_size, list_size] representing the + per-example scores. + features_list: A list of `Tensor`s to be sorted. The shape of the `Tensor` + can be [batch_size, list_size] or [batch_size, list_size, feature_dims]. + The latter is applicable for example features. + topn: An integer as the cutoff of examples in the sorted list. + shuffle_ties: A boolean. If True, randomly shuffle before the sorting. + seed: The ops-level random seed used when `shuffle_ties` is True. + mask: An optional `Tensor` of shape [batch_size, list_size] representing + which entries are valid for sorting. Invalid entries will be pushed to the + end. + + Returns: + A list of `Tensor`s as the list of sorted features by `scores`. + """ + with tf.compat.v1.name_scope(name='sort_by_scores'): + scores = tf.cast(scores, tf.float32) + scores.get_shape().assert_has_rank(2) + list_size = tf.shape(input=scores)[1] + if topn is None: + topn = list_size + topn = tf.minimum(topn, list_size) + + # Set invalid entries (those whose mask value is False) to the minimal value + # of scores so they will be placed last during sort ops. + if mask is not None: + scores = tf.where(mask, scores, tf.reduce_min(scores)) + + # Shuffle scores to break ties and/or push invalid entries (according to + # mask) to the end. 
+ shuffle_ind = None + if shuffle_ties or mask is not None: + shuffle_ind = _get_shuffle_indices(tf.shape(input=scores), mask, shuffle_ties=shuffle_ties, seed=seed) + scores = tf.gather(scores, shuffle_ind, batch_dims=1, axis=1) + + # Perform sort and return sorted feature_list entries. + _, indices = tf.math.top_k(scores, topn, sorted=True) + if shuffle_ind is not None: + indices = tf.gather(shuffle_ind, indices, batch_dims=1, axis=1) + return [tf.gather(f, indices, batch_dims=1, axis=1) for f in features_list] + + +def ragged_to_dense(labels, predictions, weights): + """Converts given inputs from ragged tensors to dense tensors. + + Args: + labels: A `tf.RaggedTensor` of the same shape as `predictions` representing + relevance. + predictions: A `tf.RaggedTensor` with shape [batch_size, (list_size)]. Each + value is the ranking score of the corresponding example. + weights: An optional `tf.RaggedTensor` of the same shape of predictions or a + `tf.Tensor` of shape [batch_size, 1]. The former case is per-example and + the latter case is per-list. + + Returns: + A tuple (labels, predictions, weights, mask) of dense `tf.Tensor`s. + """ + # TODO: Add checks to validate (ragged) shapes of input tensors. + mask = tf.cast(tf.ones_like(labels).to_tensor(0.), dtype=tf.bool) + labels = labels.to_tensor(_PADDING_LABEL) + if predictions is not None: + predictions = predictions.to_tensor(_PADDING_PREDICTION) + if isinstance(weights, tf.RaggedTensor): + weights = weights.to_tensor(_PADDING_WEIGHT) + return labels, predictions, weights, mask diff --git a/deepray/models/BUILD b/deepray/models/BUILD index 3051b4c9..0a5b776e 100644 --- a/deepray/models/BUILD +++ b/deepray/models/BUILD @@ -9,6 +9,7 @@ py_library( "**/*.py", ]), deps = [ + "//deepray/layers", "//deepray/testing", "//deepray/utils", ], diff --git a/deepray/layers/networks/README.md b/deepray/models/README.md similarity index 96% rename from deepray/layers/networks/README.md rename to deepray/models/README.md index 87cc571e..95b632df 100644 --- a/deepray/layers/networks/README.md +++ b/deepray/models/README.md @@ -1,6 +1,6 @@ -# Networks +# Models -Networks are combinations of `tf.keras` layers (and possibly other networks). +Models are combinations of `tf.keras` layers (and possibly other models). They are `tf.keras` models that would not be trained alone. It encapsulates common network structures like a transformer encoder into an easily handled object with a standardized configuration. 
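+
+A minimal sketch of the pattern (illustrative only; the encoder class below is
+hypothetical and not part of this package): a model composes existing layers into a
+reusable object that a larger task model can embed.
+
+```python
+import tensorflow as tf
+
+
+class TinyEncoder(tf.keras.Model):
+  """Toy encoder: an embedding lookup followed by a dense projection."""
+
+  # Hypothetical example for illustration; not a deepray API.
+  def __init__(self, vocab_size=100, hidden_size=16, **kwargs):
+    super().__init__(**kwargs)
+    self.embedding = tf.keras.layers.Embedding(vocab_size, hidden_size)
+    self.projection = tf.keras.layers.Dense(hidden_size, activation="relu")
+
+  def call(self, word_ids):
+    # word_ids: int tensor of shape [batch_size, seq_length].
+    return self.projection(self.embedding(word_ids))
+```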
diff --git a/deepray/models/__init__.py b/deepray/models/__init__.py index e69de29b..2a4e09c4 100644 --- a/deepray/models/__init__.py +++ b/deepray/models/__init__.py @@ -0,0 +1,3 @@ +from deepray.models.transformer_encoder import TransformerEncoder +from deepray.models.albert_transformer_encoder import AlbertTransformerEncoder +from deepray.models.bert_span_labeler import BertSpanLabeler \ No newline at end of file diff --git a/deepray/layers/networks/albert_transformer_encoder.py b/deepray/models/albert_transformer_encoder.py similarity index 91% rename from deepray/layers/networks/albert_transformer_encoder.py rename to deepray/models/albert_transformer_encoder.py index 43ff854d..1cd7c4a2 100644 --- a/deepray/layers/networks/albert_transformer_encoder.py +++ b/deepray/models/albert_transformer_encoder.py @@ -21,10 +21,13 @@ import tensorflow as tf -from deepray import layers +from deepray.layers import dense_einsum +from deepray.layers import on_device_embedding +from deepray.layers import position_embedding +from deepray.layers import self_attention_mask +from deepray.layers import transformer -@tf.keras.utils.register_keras_serializable(package='Text') class AlbertTransformerEncoder(tf.keras.Model): """ALBERT (https://arxiv.org/abs/1810.04805) text encoder network. @@ -111,7 +114,7 @@ def __init__( mask = tf.keras.layers.Input(shape=(sequence_length,), dtype=tf.int32, name='input_mask') type_ids = tf.keras.layers.Input(shape=(sequence_length,), dtype=tf.int32, name='input_type_ids') - self._embedding_layer = layers.OnDeviceEmbedding( + self._embedding_layer = on_device_embedding.OnDeviceEmbedding( vocab_size=vocab_size, embedding_width=embedding_width, initializer=initializer, @@ -121,13 +124,13 @@ def __init__( word_embeddings = self._embedding_layer(word_ids) # Always uses dynamic slicing for simplicity. - self._position_embedding_layer = layers.PositionEmbedding( + self._position_embedding_layer = position_embedding.PositionEmbedding( initializer=initializer, use_dynamic_slicing=True, max_sequence_length=max_sequence_length, dtype=float_dtype ) position_embeddings = self._position_embedding_layer(word_embeddings) type_embeddings = ( - layers.OnDeviceEmbedding( + on_device_embedding.OnDeviceEmbedding( vocab_size=type_vocab_size, embedding_width=embedding_width, initializer=initializer, @@ -146,16 +149,18 @@ def __init__( # We project the 'embedding' output to 'hidden_size' if it is not already # 'hidden_size'. 
if embedding_width != hidden_size: - embeddings = layers.DenseEinsum( + embeddings = dense_einsum.DenseEinsum( output_shape=hidden_size, kernel_initializer=initializer, name='embedding_projection' )(embeddings) if float_dtype == 'float16': embeddings = tf.cast(embeddings, tf.float16) + elif float_dtype == 'bfloat16': + embeddings = tf.cast(embeddings, tf.bfloat16) data = embeddings - attention_mask = layers.SelfAttentionMask()([data, mask]) - shared_layer = layers.Transformer( + attention_mask = self_attention_mask.SelfAttentionMask()([data, mask]) + shared_layer = transformer.Transformer( num_attention_heads=num_attention_heads, intermediate_size=intermediate_size, intermediate_activation=activation, diff --git a/deepray/layers/networks/bert_classifier.py b/deepray/models/bert_classifier.py similarity index 95% rename from deepray/layers/networks/bert_classifier.py rename to deepray/models/bert_classifier.py index 5c5e8606..faea48d4 100644 --- a/deepray/layers/networks/bert_classifier.py +++ b/deepray/models/bert_classifier.py @@ -21,10 +21,9 @@ import tensorflow as tf -from deepray.layers import networks +from deepray.models import classification -@tf.keras.utils.register_keras_serializable(package='Text') class BertClassifier(tf.keras.Model): """Classifier model based on a BERT-style transformer-based encoder. @@ -66,7 +65,7 @@ def __init__(self, network, num_classes, initializer='glorot_uniform', output='l _, cls_output = network(inputs) cls_output = tf.keras.layers.Dropout(rate=dropout_rate)(cls_output) - self.classifier = networks.Classification( + self.classifier = classification.Classification( input_width=cls_output.shape[-1], num_classes=num_classes, initializer=initializer, diff --git a/deepray/layers/networks/bert_pretrainer.py b/deepray/models/bert_pretrainer.py similarity index 96% rename from deepray/layers/networks/bert_pretrainer.py rename to deepray/models/bert_pretrainer.py index bd3ef5ff..90858b45 100644 --- a/deepray/layers/networks/bert_pretrainer.py +++ b/deepray/models/bert_pretrainer.py @@ -20,12 +20,13 @@ from __future__ import print_function import copy + import tensorflow as tf -from deepray.layers import networks +from deepray.models import classification +from deepray.models import masked_lm -@tf.keras.utils.register_keras_serializable(package='Text') class BertPretrainer(tf.keras.Model): """BERT network training model. 
@@ -100,7 +101,7 @@ def __init__( ) inputs.append(masked_lm_positions) - self.masked_lm = networks.MaskedLM( + self.masked_lm = masked_lm.MaskedLM( num_predictions=num_token_predictions, input_width=sequence_output.shape[-1], source_network=network, @@ -112,7 +113,7 @@ def __init__( ) lm_outputs = self.masked_lm([sequence_output, masked_lm_positions]) - self.classification = networks.Classification( + self.classification = classification.Classification( input_width=cls_output.shape[-1], num_classes=num_classes, initializer=initializer, diff --git a/deepray/layers/networks/bert_span_labeler.py b/deepray/models/bert_span_labeler.py similarity index 93% rename from deepray/layers/networks/bert_span_labeler.py rename to deepray/models/bert_span_labeler.py index 046b7acc..e7a9b312 100644 --- a/deepray/layers/networks/bert_span_labeler.py +++ b/deepray/models/bert_span_labeler.py @@ -21,10 +21,9 @@ import tensorflow as tf -from deepray.layers import networks +from deepray.models import span_labeling -@tf.keras.utils.register_keras_serializable(package='Text') class BertSpanLabeler(tf.keras.Model): """Span labeler model based on a BERT-style transformer-based encoder. @@ -63,7 +62,7 @@ def __init__(self, network, initializer='glorot_uniform', output='logits', **kwa # This is an instance variable for ease of access to the underlying task # network. - self.span_labeling = networks.SpanLabeling( + self.span_labeling = span_labeling.SpanLabeling( input_width=sequence_output.shape[-1], initializer=initializer, output=output, name='span_labeling' ) start_logits, end_logits = self.span_labeling(sequence_output) @@ -74,7 +73,7 @@ def __init__(self, network, initializer='glorot_uniform', output='logits', **kwa start_logits = tf.keras.layers.Lambda(tf.identity, name='start_positions')(start_logits) end_logits = tf.keras.layers.Lambda(tf.identity, name='end_positions')(end_logits) - logits = {"start_positions": start_logits, "end_positions": end_logits} + logits = [start_logits, end_logits] super(BertSpanLabeler, self).__init__(inputs=inputs, outputs=logits, **kwargs) diff --git a/deepray/layers/networks/classification.py b/deepray/models/classification.py similarity index 97% rename from deepray/layers/networks/classification.py rename to deepray/models/classification.py index 7a53f63e..b447fa19 100644 --- a/deepray/layers/networks/classification.py +++ b/deepray/models/classification.py @@ -22,7 +22,6 @@ import tensorflow as tf -@tf.keras.utils.register_keras_serializable(package='Text') class Classification(tf.keras.Model): """Classification network head for BERT modeling. diff --git a/deepray/layers/networks/encoder_scaffold.py b/deepray/models/encoder_scaffold.py similarity index 94% rename from deepray/layers/networks/encoder_scaffold.py rename to deepray/models/encoder_scaffold.py index 4a5551ca..695b3191 100644 --- a/deepray/layers/networks/encoder_scaffold.py +++ b/deepray/models/encoder_scaffold.py @@ -20,12 +20,15 @@ from __future__ import print_function import inspect + import tensorflow as tf -from deepray import layers +from deepray.layers import on_device_embedding +from deepray.layers import position_embedding +from deepray.layers import self_attention_mask +from deepray.layers import transformer -@tf.keras.utils.register_keras_serializable(package='Text') class EncoderScaffold(tf.keras.Model): """Bi-directional Transformer-based encoder network scaffold. 
@@ -95,7 +98,7 @@ def __init__( embedding_cfg=None, embedding_data=None, num_hidden_instances=1, - hidden_cls=layers.Transformer, + hidden_cls=transformer.Transformer, hidden_cfg=None, **kwargs ): @@ -125,7 +128,7 @@ def __init__( type_ids = tf.keras.layers.Input(shape=(embedding_cfg['seq_length'],), dtype=tf.int32, name='input_type_ids') inputs = [word_ids, mask, type_ids] - self._embedding_layer = layers.OnDeviceEmbedding( + self._embedding_layer = on_device_embedding.OnDeviceEmbedding( vocab_size=embedding_cfg['vocab_size'], embedding_width=embedding_cfg['hidden_size'], initializer=embedding_cfg['initializer'], @@ -135,7 +138,7 @@ def __init__( word_embeddings = self._embedding_layer(word_ids) # Always uses dynamic slicing for simplicity. - self._position_embedding_layer = layers.PositionEmbedding( + self._position_embedding_layer = position_embedding.PositionEmbedding( initializer=embedding_cfg['initializer'], use_dynamic_slicing=True, max_sequence_length=embedding_cfg['max_seq_length'] @@ -143,7 +146,7 @@ def __init__( position_embeddings = self._position_embedding_layer(word_embeddings) type_embeddings = ( - layers.OnDeviceEmbedding( + on_device_embedding.OnDeviceEmbedding( vocab_size=embedding_cfg['type_vocab_size'], embedding_width=embedding_cfg['hidden_size'], initializer=embedding_cfg['initializer'], @@ -161,8 +164,10 @@ def __init__( if embedding_cfg.get('dtype') == 'float16': embeddings = tf.cast(embeddings, tf.float16) + elif embedding_cfg.get('dtype') == 'bfloat16': + embeddings = tf.cast(embeddings, tf.bfloat16) - attention_mask = layers.SelfAttentionMask()([embeddings, mask]) + attention_mask = self_attention_mask.SelfAttentionMask()([embeddings, mask]) data = embeddings for _ in range(num_hidden_instances): diff --git a/deepray/layers/networks/masked_lm.py b/deepray/models/masked_lm.py similarity index 98% rename from deepray/layers/networks/masked_lm.py rename to deepray/models/masked_lm.py index b2a059db..9f89fddb 100644 --- a/deepray/layers/networks/masked_lm.py +++ b/deepray/models/masked_lm.py @@ -24,7 +24,6 @@ from deepray.layers import tf_utils -@tf.keras.utils.register_keras_serializable(package='Text') class MaskedLM(tf.keras.Model): """Masked language model network head for BERT modeling. @@ -126,7 +125,6 @@ def _gather_indexes(self, sequence_tensor, positions): return output_tensor -@tf.keras.utils.register_keras_serializable(package='Text') # Temporary until we can create a Dense layer that ties the embedding. class Bias(tf.keras.layers.Layer): """Adds a bias term to an input.""" diff --git a/deepray/models/ncf_common.py b/deepray/models/ncf_common.py index 0f2c7c58..645f6596 100644 --- a/deepray/models/ncf_common.py +++ b/deepray/models/ncf_common.py @@ -33,8 +33,6 @@ from deepray.datasets.movielens import data_preprocessing from deepray.utils.flags import core as flags_core -FLAGS = flags.FLAGS - def get_inputs(params): """Returns some parameters used by the model.""" diff --git a/deepray/models/ncf_model.py b/deepray/models/ncf_model.py index 92b978f2..0abe58e1 100644 --- a/deepray/models/ncf_model.py +++ b/deepray/models/ncf_model.py @@ -87,7 +87,7 @@ def call(self, inputs, training=None, mask=None): # Custom training loop calculates loss and metric as a part of # training/evaluation step function. 
- if not self._params["keras_use_ctl"]: + if not self._params["use_custom_training_loop"]: softmax_logits = MetricLayer(self._params["match_mlperf"])([softmax_logits, dup_mask_input]) # TODO(b/134744680): Use model.add_loss() instead once the API is well # supported. diff --git a/deepray/models/ncf_test.py b/deepray/models/ncf_test.py index 4a797200..1555119c 100644 --- a/deepray/models/ncf_test.py +++ b/deepray/models/ncf_test.py @@ -65,7 +65,7 @@ def test_end_to_end_keras_dist_strat(self): @unittest.mock.patch.object(rconst, 'SYNTHETIC_BATCHES_PER_EPOCH', 100) def test_end_to_end_keras_dist_strat_ctl(self): - flags = (self._BASE_END_TO_END_FLAGS + ['-num_gpus', '0'] + ['-keras_use_ctl', 'True']) + flags = (self._BASE_END_TO_END_FLAGS + ['-num_gpus', '0'] + ['-use_custom_training_loop', 'True']) integration.run_synthetic(ncf_keras_main.main, tmp_root=self.get_temp_dir(), extra_flags=flags) @unittest.mock.patch.object(rconst, 'SYNTHETIC_BATCHES_PER_EPOCH', 100) @@ -87,7 +87,7 @@ def test_end_to_end_keras_1_gpu_dist_strat_ctl_fp16(self): integration.run_synthetic( ncf_keras_main.main, tmp_root=self.get_temp_dir(), - extra_flags=self._BASE_END_TO_END_FLAGS + ['-num_gpus', '1', '--dtype', 'fp16', '--keras_use_ctl'] + extra_flags=self._BASE_END_TO_END_FLAGS + ['-num_gpus', '1', '--dtype', 'fp16', '--use_custom_training_loop'] ) @unittest.mock.patch.object(rconst, 'SYNTHETIC_BATCHES_PER_EPOCH', 100) diff --git a/deepray/models/rec/base_model.py b/deepray/models/rec/base_model.py index cedc5014..59095974 100644 --- a/deepray/models/rec/base_model.py +++ b/deepray/models/rec/base_model.py @@ -18,8 +18,6 @@ from deepray.utils.data.feature_map import FeatureMap from deepray.utils.data.input_meta import InputMeta -FLAGS = flags.FLAGS - # if FLAGS.use_dynamic_embedding: from tensorflow_recommenders_addons import dynamic_embedding as de diff --git a/deepray/models/rec/flen.py b/deepray/models/rec/flen.py index 2c7eff2c..ff2c245a 100644 --- a/deepray/models/rec/flen.py +++ b/deepray/models/rec/flen.py @@ -29,8 +29,6 @@ from deepray.utils.data.feature_map import FeatureMap from deepray.layers.field_wise_bi_interaction import FieldWiseBiInteraction -FLAGS = flags.FLAGS - __all__ = [ 'FLEN', ] diff --git a/deepray/models/rec/tfra_demo.py b/deepray/models/rec/tfra_demo.py deleted file mode 100644 index 076cec3e..00000000 --- a/deepray/models/rec/tfra_demo.py +++ /dev/null @@ -1,192 +0,0 @@ -import tensorflow as tf - -from tensorflow.keras.layers import (Layer, Input, Concatenate, Dense, Flatten, Lambda) -from tensorflow_recommenders_addons import dynamic_embedding as de - - -class DeepLayer(Layer): - - def __init__(self, hidden_dim, layer_num, out_dim): - self.layers = [] - self.hidden_dim = hidden_dim - self.layer_num = layer_num - self.out_dim = out_dim - for i in range(layer_num): - self.layers.append(Dense(hidden_dim, "relu")) - self.layers.append(Dense(out_dim, "sigmoid")) - super(DeepLayer, self).__init__() - - def call(self, inputs): - output = inputs - for layer in self.layers: - output = layer(output) - return output # (batch, out_dim) - - def get_config(self): - config = super().get_config() - config.update({ - "hidden_dim": self.hidden_dim, - "layer_num": self.layer_num, - "out_dim": self.out_dim, - }) - return config - - -# 构建model -def build_keras_model(is_training, mpi_size, mpi_rank): - # 初始化参数 - embedding_size = 8 - - if is_training: - initializer = tf.keras.initializers.VarianceScaling() - else: - initializer = tf.keras.initializers.Zeros() - gpu_device = ["GPU:0"] - cpu_device = ["CPU:0"] 
- - dense_embedding_layer = de.keras.layers.HvdAllToAllEmbedding( - mpi_size=mpi_size, - embedding_size=embedding_size, - key_dtype=tf.int32, - value_dtype=tf.float32, - initializer=initializer, - devices=gpu_device, - name='DenseUnifiedEmbeddingLayer', - kv_creator=de.CuckooHashTableCreator(saver=de.FileSystemSaver(proc_size=mpi_size, proc_rank=mpi_rank)) - ) - - sparse_embedding_layer = de.keras.layers.HvdAllToAllEmbedding( - mpi_size=mpi_size, - embedding_size=embedding_size, - key_dtype=tf.int64, - value_dtype=tf.float32, - initializer=initializer, - devices=cpu_device, - name='SparseUnifiedEmbeddingLayer', - kv_creator=de.CuckooHashTableCreator(saver=de.FileSystemSaver(proc_size=mpi_size, proc_rank=mpi_rank)) - ) - - # 输入层 - dense_input_dict = {"movie_genres": {'code': 1111, 'dim': 1}, "user_gender": {'code': 2222, 'dim': 1}} - sparse_input_dict = {"movie_id": {'code': 3333, 'dim': 1}, "user_id": {'code': 4444, 'dim': 1}} - - inputs = dict() - embedding_outs = [] - - # 定义 gpu embedding层 - # 主要思路是合并输入进行embedding查询,最大化利用gpu并行能力,并降低kernel launch time - # 由于 gpu dynamic embedding的动态增机制,请务必设置os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "true",以保证显存不会被tensorflow graph预读。 - ################################################### - dense_input_tensors = list() - dense_input_split_dims = list() - for input_name in dense_input_dict.keys(): - dense_input_tensor = Input(shape=(1,), dtype=tf.int32, name=input_name) - inputs[input_name] = dense_input_tensor - - input_tensor_prefix_code = int(dense_input_dict[input_name]["code"]) << 17 - # dense_input_tensor = tf.bitwise.bitwise_xor(dense_input_tensor, input_tensor_prefix_code) - # xor可以用加法替代,方便后续TRT、openvino的优化 - dense_input_tensor = tf.add(dense_input_tensor, input_tensor_prefix_code) - dense_input_tensors.append(dense_input_tensor) - dense_input_split_dims.append(dense_input_dict[input_name]["dim"]) - - tmp_sum = 0 - dense_input_split_dims_final = [] - dense_input_is_sequence_feature = [] - for dim in dense_input_split_dims: - if dim == 1: - tmp_sum = tmp_sum + 1 - elif dim > 1: - if tmp_sum > 0: - dense_input_split_dims_final.append(tmp_sum) - dense_input_is_sequence_feature.append(False) - dense_input_split_dims_final.append(dim) - dense_input_is_sequence_feature.append(True) - tmp_sum = 0 - else: - raise ("dim must >= 1, which is {}".format(dim)) - if tmp_sum > 0: - dense_input_split_dims_final.append(tmp_sum) - dense_input_is_sequence_feature.append(False) - - dense_input_tensors_concat = Concatenate(axis=1)(dense_input_tensors) - dense_embedding_out_concat = dense_embedding_layer(dense_input_tensors_concat) - ################################################### - # gpu embedding部分结束 - - # 定义 cpu embedding层 - # id类特征维度空间大,显存不够用,放在主机内存 - ################################################### - sparse_input_tensors = list() - sparse_input_split_dims = list() - for input_name in sparse_input_dict.keys(): - sparse_input_tensor = Input(shape=(1,), dtype=tf.int64, name=input_name) - inputs[input_name] = sparse_input_tensor - - input_tensor_prefix_code = int(sparse_input_dict[input_name]["code"]) << 47 - # id_tensor = tf.bitwise.bitwise_xor(sparse_input_tensor, input_tensor_prefix_code) - # xor可以用加法替代,方便后续TRT、openvino的优化 - sparse_input_tensor = tf.add(sparse_input_tensor, input_tensor_prefix_code) - sparse_input_tensors.append(sparse_input_tensor) - sparse_input_split_dims.append(sparse_input_dict[input_name]["dim"]) - - tmp_sum = 0 - sparse_input_split_dims_final = [] - sparse_input_is_sequence_feature = [] - for dim in sparse_input_split_dims: - if dim == 1: 
- tmp_sum = tmp_sum + 1 - elif dim > 1: - if tmp_sum > 0: - sparse_input_split_dims_final.append(tmp_sum) - sparse_input_is_sequence_feature.append(False) - sparse_input_split_dims_final.append(dim) - sparse_input_is_sequence_feature.append(True) - tmp_sum = 0 - else: - raise ("dim must >= 1, which is {}".format(dim)) - if tmp_sum > 0: - sparse_input_split_dims_final.append(tmp_sum) - sparse_input_is_sequence_feature.append(False) - - sparse_input_tensors_concat = Concatenate(axis=1)(sparse_input_tensors) - sparse_embedding_out_concat = sparse_embedding_layer(sparse_input_tensors_concat) - ################################################### - # cpu embedding部分结束 - - # 接下来是特别处理向量特征 - # split_dims和is_sequence_feature用来辨识向量特征 - ################################################### - embedding_out = list() - embedding_out.extend( - tf.split(dense_embedding_out_concat, dense_input_split_dims_final, axis=1) - ) # (feature_combin_num, (batch, dim, emb_size)) - embedding_out.extend( - tf.split(sparse_embedding_out_concat, sparse_input_split_dims_final, axis=1) - ) # (feature_combin_num, (batch, dim, emb_size)) - assert ((len(dense_input_is_sequence_feature) + len(sparse_input_is_sequence_feature)) == len(embedding_out)) - is_sequence_feature = dense_input_is_sequence_feature + sparse_input_is_sequence_feature - for i, embedding in enumerate(embedding_out): - if is_sequence_feature[i] == True: - # 处理向量特征获得的embedding - embedding_vec = tf.math.reduce_mean( - embedding, axis=1, keepdims=True - ) # (feature_combin_num, (batch, x, emb_size)) - else: - embedding_vec = embedding - embedding_outs.append(embedding_vec) - - ################################################### - ################################################### - # embedding层 部分结束 - ################################################### - ################################################### - - # 算法后续部分 - embeddings_concat = Flatten()(Concatenate(axis=1)(embedding_outs)) - - outs = DeepLayer(256, 1, 1)(embeddings_concat) - outs = Lambda(lambda x: x, name="user_rating")(outs) - - model = tf.keras.Model(inputs=inputs, outputs=outs) - return model diff --git a/deepray/models/rec/tower_new_tfra.py b/deepray/models/rec/tower_new_tfra.py deleted file mode 100644 index 7f132302..00000000 --- a/deepray/models/rec/tower_new_tfra.py +++ /dev/null @@ -1,162 +0,0 @@ -# -*- coding:utf-8 -*- - -import tensorflow as tf -from absl import logging, flags -from tensorflow.keras.layers import Concatenate -from tensorflow.keras.layers import Flatten, Lambda -from tensorflow.python.framework import constant_op -from tensorflow.python.keras import backend_config -from tensorflow.python.ops import clip_ops - -from .base_model import BaseModel - -epsilon = backend_config.epsilon -FLAGS = flags.FLAGS - - -class TowerNewTFRA(BaseModel): - - def __call__( - self, - nn_hidden_units=(256, 128, 64), - nn_l2_reg=0.0, - nn_dropout=0.0, - nn_use_bn=False, - is_training=True, - *args, - **kwargs - ): - self._nn_hidden_units = nn_hidden_units - self._is_training = is_training - - self.targets = list(self.target_label_table.keys()) - self.input_dict = self.input_from_features() - features = self.build_features() - output_dict = self.build_network(features=features) - model = tf.keras.Model(inputs=self.input_dict, outputs=output_dict) - return model - - def build_network(self, flags=None, features=None): - geek_nn_dense_features, job_nn_dense_features = self.get_input_and_dense_features( - features, self._is_training, self.get_geek_nn_compo(), self.get_job_nn_compo(), self.targets, 
diff --git a/deepray/models/rec/tower_new_tfra.py b/deepray/models/rec/tower_new_tfra.py deleted file mode 100644 index 7f132302..00000000 --- a/deepray/models/rec/tower_new_tfra.py +++ /dev/null @@ -1,162 +0,0 @@ -# -*- coding:utf-8 -*- - -import tensorflow as tf -from absl import logging, flags -from tensorflow.keras.layers import Concatenate -from tensorflow.keras.layers import Flatten, Lambda -from tensorflow.python.framework import constant_op -from tensorflow.python.keras import backend_config -from tensorflow.python.ops import clip_ops - -from .base_model import BaseModel - -epsilon = backend_config.epsilon -FLAGS = flags.FLAGS - - -class TowerNewTFRA(BaseModel): - - def __call__( - self, - nn_hidden_units=(256, 128, 64), - nn_l2_reg=0.0, - nn_dropout=0.0, - nn_use_bn=False, - is_training=True, - *args, - **kwargs - ): - self._nn_hidden_units = nn_hidden_units - self._is_training = is_training - - self.targets = list(self.target_label_table.keys()) - self.input_dict = self.input_from_features() - features = self.build_features() - output_dict = self.build_network(features=features) - model = tf.keras.Model(inputs=self.input_dict, outputs=output_dict) - return model - - def build_network(self, flags=None, features=None): - geek_nn_dense_features, job_nn_dense_features = self.get_input_and_dense_features( - features, self._is_training, self.get_geek_nn_compo(), self.get_job_nn_compo(), self.targets, extra_dim=0 - ) - - # print("input_list:", len(input_list)) - # print("geek nn:", len(geek_nn_dense_features)) - # print("job nn:", len(job_nn_dense_features)) - - x_job = Flatten()(Concatenate(axis=-1)(job_nn_dense_features)) - x_geek = Flatten()(Concatenate(axis=-1)(geek_nn_dense_features)) - for i, n in enumerate(self._nn_hidden_units): - x_job = tf.keras.layers.Dense(n, activation='relu')(x_job) - x_geek = tf.keras.layers.Dense(n, activation='relu')(x_geek) - # if nn_dropout: - # x_job = tf.keras.layers.Dropout(nn_dropout[i])(x_job) - # x_geek = tf.keras.layers.Dropout(nn_dropout[i])(x_geek) - - x_job = Lambda(lambda x: tf.math.l2_normalize(x, axis=1))(x_job) - x_geek = Lambda(lambda x: tf.math.l2_normalize(x, axis=1))(x_geek) - predict_out = tf.keras.layers.Dot(axes=-1, normalize=False)([x_job, x_geek]) - - epsilon_ = constant_op.constant(epsilon(), dtype=predict_out.dtype.base_dtype) - predict_out = clip_ops.clip_by_value(predict_out, epsilon_, 1. - epsilon_) - - # output target - output_dict = dict() - output_dict['addf'] = Lambda(lambda x: x, name="addf")(predict_out) - output_dict['predict'] = Lambda(lambda x: x, name="predict")(predict_out) - output_dict['predict_0'] = Lambda(lambda x: x, name="predict_0")(predict_out) - output_dict['job_vec'] = tf.keras.layers.Lambda(lambda x: x, name='job_vec')(x_job) - output_dict['geek_vec'] = tf.keras.layers.Lambda(lambda x: x, name='geek_vec')(x_geek) - - for i, target in enumerate(self.target_label_table): - output_dict[target] = Lambda(lambda x: x, name=target)(predict_out) - - # output evaluation targets & metrics - print("conf.evaluate_target:", self.conf.evaluate_target) - - for key, config in self.conf.evaluate_target.items(): - target = config['target'] if 'target' in config else 'predict' - if target in output_dict: - pass - else: - target = 'predict' - output_dict[key] = Lambda(lambda x: x, name=key)(output_dict[target]) - - logging.info(f'output_dict: {output_dict}') - return output_dict - - # Generate NN features - def get_input_and_dense_features(self, features, is_training, geek_comp, job_comp, targets=None, extra_dim=0): - # NN features - id_features = [] - nn_features = [] - all_features = [] - geek_cnt = geek_comp - job_cnt = job_comp - emb_dim_by_name = dict() - num_targets = 1 if not targets else len(targets) - geek_feature_set = set() - for field, fea_list in self.field_dict.items(): - if field in geek_cnt: - emb_dims = geek_cnt[field] - elif field in job_cnt: - emb_dims = job_cnt[field] - else: - continue - - if len(emb_dims) < len(fea_list): - emb_dims = emb_dims + [emb_dims[-1]] * (len(fea_list) - len(emb_dims)) - - for i, fea_name in enumerate(fea_list): - if field in geek_cnt: - geek_feature_set.add(fea_name) - - feature = features[fea_name] - emb_name = feature.emb_name - emb_dim = emb_dims[i] - if self.conf.emb_reuse: - if emb_name in emb_dim_by_name and emb_dim_by_name[emb_name] != emb_dim: - logging.warn(f"[EMBED REUSE] {feature.name}@{emb_name} from {emb_dim} to {emb_dim_by_name[emb_name]}") - emb_dim = emb_dim_by_name[emb_name] - emb_dim_by_name[emb_name] = emb_dim - if feature.emb_dynamic: - id_features.append( - self.make_feature( - f=feature, - emb_dim=emb_dim * num_targets + (extra_dim if extra_dim > 0 else 0), - emb_split=[emb_dim] * num_targets + ([extra_dim] if extra_dim > 0 else []) - ) - ) - else: - nn_features.append( - self.make_feature( - f=feature, - emb_dim=emb_dim * num_targets + (extra_dim if extra_dim > 0 else 0), - emb_split=[emb_dim] * num_targets + ([extra_dim] if extra_dim > 0 else []) - ) - ) -
all_features.append( - self.make_feature( - f=feature, - emb_dim=emb_dim * num_targets + (extra_dim if extra_dim > 0 else 0), - emb_split=[emb_dim] * num_targets + ([extra_dim] if extra_dim > 0 else []) - ) - ) - - emb_dict = self.embedding_from_feature(all_features, is_training) - id_dense_features = self.dense_from_columns_id(id_features, emb_dict) - print("id:", id_dense_features) - nn_dense_features = self.dense_from_columns(nn_features, emb_dict) - nn_dense_features.update(id_dense_features) - - geek_dense_features = [] - job_dense_features = [] - i = 0 - for emb_name, feas in nn_dense_features.items(): - if emb_name in geek_feature_set: - geek_dense_features.append(feas) - else: - job_dense_features.append(feas) - - return geek_dense_features, job_dense_features diff --git a/deepray/layers/networks/span_labeling.py b/deepray/models/span_labeling.py similarity index 100% rename from deepray/layers/networks/span_labeling.py rename to deepray/models/span_labeling.py diff --git a/deepray/models/tests/__init__.py b/deepray/models/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/deepray/layers/networks/albert_transformer_encoder_test.py b/deepray/models/tests/albert_transformer_encoder_test.py similarity index 98% rename from deepray/layers/networks/albert_transformer_encoder_test.py rename to deepray/models/tests/albert_transformer_encoder_test.py index aed76c7b..3bf39ead 100644 --- a/deepray/layers/networks/albert_transformer_encoder_test.py +++ b/deepray/models/tests/albert_transformer_encoder_test.py @@ -23,7 +23,7 @@ import tensorflow as tf from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import -from official.nlp.modeling.networks import albert_transformer_encoder +from deepray.models import albert_transformer_encoder # This decorator runs the test in V1, V2-Eager, and V2-Functional mode. 
It diff --git a/deepray/layers/networks/bert_classifier_test.py b/deepray/models/tests/bert_classifier_test.py similarity index 100% rename from deepray/layers/networks/bert_classifier_test.py rename to deepray/models/tests/bert_classifier_test.py diff --git a/deepray/layers/networks/bert_pretrainer_test.py b/deepray/models/tests/bert_pretrainer_test.py similarity index 100% rename from deepray/layers/networks/bert_pretrainer_test.py rename to deepray/models/tests/bert_pretrainer_test.py diff --git a/deepray/layers/networks/bert_span_labeler_test.py b/deepray/models/tests/bert_span_labeler_test.py similarity index 100% rename from deepray/layers/networks/bert_span_labeler_test.py rename to deepray/models/tests/bert_span_labeler_test.py diff --git a/deepray/layers/networks/classification_test.py b/deepray/models/tests/classification_test.py similarity index 100% rename from deepray/layers/networks/classification_test.py rename to deepray/models/tests/classification_test.py diff --git a/deepray/layers/networks/encoder_scaffold_test.py b/deepray/models/tests/encoder_scaffold_test.py similarity index 96% rename from deepray/layers/networks/encoder_scaffold_test.py rename to deepray/models/tests/encoder_scaffold_test.py index 9d25100d..afa7cb65 100644 --- a/deepray/layers/networks/encoder_scaffold_test.py +++ b/deepray/models/tests/encoder_scaffold_test.py @@ -22,8 +22,10 @@ import tensorflow as tf from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import -from deepray import layers -from deepray.layers.networks import encoder_scaffold +from official.modeling import activations +from official.nlp.modeling import layers +from official.nlp.modeling.networks import encoder_scaffold +from deepray.layers import on_device_embedding # Test class that wraps a standard transformer layer. 
If this layer is called @@ -70,7 +72,7 @@ def test_network_creation(self): hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 3072, - "intermediate_activation": tf.keras.activations.gelu, + "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), @@ -123,7 +125,7 @@ def test_network_creation_with_float16_dtype(self): hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 3072, - "intermediate_activation": tf.keras.activations.gelu, + "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), @@ -170,7 +172,7 @@ def test_network_invocation(self): hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 3072, - "intermediate_activation": tf.keras.activations.gelu, + "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), @@ -219,7 +221,7 @@ def test_network_invocation(self): hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 3072, - "intermediate_activation": tf.keras.activations.gelu, + "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), @@ -252,7 +254,7 @@ def test_serialize_deserialize(self): hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 3072, - "intermediate_activation": tf.keras.activations.gelu, + "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), @@ -291,7 +293,7 @@ def test_network_invocation(self): word_ids = tf.keras.layers.Input(shape=(sequence_length,), dtype=tf.int32, name="input_word_ids") mask = tf.keras.layers.Input(shape=(sequence_length,), dtype=tf.int32, name="input_mask") - embedding_layer = layers.OnDeviceEmbedding( + embedding_layer = on_device_embedding.OnDeviceEmbedding( vocab_size=vocab_size, embedding_width=hidden_size, initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02), @@ -303,7 +305,7 @@ def test_network_invocation(self): hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 3072, - "intermediate_activation": tf.keras.activations.gelu, + "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), @@ -350,7 +352,7 @@ def test_serialize_deserialize(self): word_ids = tf.keras.layers.Input(shape=(sequence_length,), dtype=tf.int32, name="input_word_ids") mask = tf.keras.layers.Input(shape=(sequence_length,), dtype=tf.int32, name="input_mask") - embedding_layer = layers.OnDeviceEmbedding( + embedding_layer = on_device_embedding.OnDeviceEmbedding( vocab_size=vocab_size, embedding_width=hidden_size, initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02), @@ -362,7 +364,7 @@ def test_serialize_deserialize(self): hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 3072, - "intermediate_activation": tf.keras.activations.gelu, + "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), @@ -441,7 +443,7 @@ def test_network_invocation(self): hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 
3072, - "intermediate_activation": tf.keras.activations.gelu, + "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), @@ -504,7 +506,7 @@ def test_serialize_deserialize(self): hidden_cfg = { "num_attention_heads": 2, "intermediate_size": 3072, - "intermediate_activation": tf.keras.activations.gelu, + "intermediate_activation": activations.gelu, "dropout_rate": 0.1, "attention_dropout_rate": 0.1, "kernel_initializer": tf.keras.initializers.TruncatedNormal(stddev=0.02), diff --git a/deepray/layers/networks/masked_lm_test.py b/deepray/models/tests/masked_lm_test.py similarity index 98% rename from deepray/layers/networks/masked_lm_test.py rename to deepray/models/tests/masked_lm_test.py index 65a2e417..5a5f8963 100644 --- a/deepray/layers/networks/masked_lm_test.py +++ b/deepray/models/tests/masked_lm_test.py @@ -23,7 +23,8 @@ from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import -from deepray.layers.networks import transformer_encoder, masked_lm +from official.nlp.modeling.networks import masked_lm +from official.nlp.modeling.networks import transformer_encoder # This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It diff --git a/deepray/layers/networks/span_labeling_test.py b/deepray/models/tests/span_labeling_test.py similarity index 100% rename from deepray/layers/networks/span_labeling_test.py rename to deepray/models/tests/span_labeling_test.py diff --git a/deepray/layers/networks/transformer_encoder_test.py b/deepray/models/tests/transformer_encoder_test.py similarity index 99% rename from deepray/layers/networks/transformer_encoder_test.py rename to deepray/models/tests/transformer_encoder_test.py index 400d27bb..70945fbd 100644 --- a/deepray/layers/networks/transformer_encoder_test.py +++ b/deepray/models/tests/transformer_encoder_test.py @@ -22,7 +22,7 @@ import tensorflow as tf from tensorflow.python.keras import keras_parameterized # pylint: disable=g-direct-tensorflow-import -from deepray.layers.networks import transformer_encoder +from official.nlp.modeling.networks import transformer_encoder # This decorator runs the test in V1, V2-Eager, and V2-Functional mode. It diff --git a/deepray/layers/networks/transformer_encoder.py b/deepray/models/transformer_encoder.py similarity index 92% rename from deepray/layers/networks/transformer_encoder.py rename to deepray/models/transformer_encoder.py index 6127b07a..11ef77ef 100644 --- a/deepray/layers/networks/transformer_encoder.py +++ b/deepray/models/transformer_encoder.py @@ -21,10 +21,12 @@ import tensorflow as tf -from deepray import layers +from deepray.layers import on_device_embedding +from deepray.layers import position_embedding +from deepray.layers import self_attention_mask +from deepray.layers import transformer -@tf.keras.utils.register_keras_serializable(package='Text') class TransformerEncoder(tf.keras.Model): """Bi-directional Transformer-based encoder network. 
@@ -103,19 +105,19 @@ def __init__( mask = tf.keras.layers.Input(shape=(sequence_length,), dtype=tf.int32, name='input_mask') type_ids = tf.keras.layers.Input(shape=(sequence_length,), dtype=tf.int32, name='input_type_ids') - self._embedding_layer = layers.OnDeviceEmbedding( + self._embedding_layer = on_device_embedding.OnDeviceEmbedding( vocab_size=vocab_size, embedding_width=hidden_size, initializer=initializer, name='word_embeddings' ) word_embeddings = self._embedding_layer(word_ids) # Always uses dynamic slicing for simplicity. - self._position_embedding_layer = layers.PositionEmbedding( + self._position_embedding_layer = position_embedding.PositionEmbedding( initializer=initializer, use_dynamic_slicing=True, max_sequence_length=max_sequence_length ) position_embeddings = self._position_embedding_layer(word_embeddings) type_embeddings = ( - layers.OnDeviceEmbedding( + on_device_embedding.OnDeviceEmbedding( vocab_size=type_vocab_size, embedding_width=hidden_size, initializer=initializer, @@ -133,11 +135,13 @@ def __init__( if float_dtype == 'float16': embeddings = tf.cast(embeddings, tf.float16) + elif float_dtype == 'bfloat16': + embeddings = tf.cast(embeddings, tf.bfloat16) data = embeddings - attention_mask = layers.SelfAttentionMask()([data, mask]) + attention_mask = self_attention_mask.SelfAttentionMask()([data, mask]) for i in range(num_layers): - layer = layers.Transformer( + layer = transformer.Transformer( num_attention_heads=num_attention_heads, intermediate_size=intermediate_size, intermediate_activation=activation, diff --git a/deepray/optimizers/BUILD b/deepray/optimizers/BUILD index 12756ad8..655f307d 100644 --- a/deepray/optimizers/BUILD +++ b/deepray/optimizers/BUILD @@ -18,6 +18,19 @@ py_test( main = "tests/run_all_test.py", deps = [ ":optimizers", + "//deepray/custom_ops/embedding_variable", + "//deepray/custom_ops/training_ops", + ], +) + +py_test( + name = "adam_test", + size = "medium", + srcs = glob(["tests/adam_test.py"]), + main = "tests/adam_test.py", + deps = [ + ":optimizers", + # "//deepray/custom_ops/embedding_variable", "//deepray/custom_ops/training_ops", ], ) diff --git a/deepray/optimizers/__init__.py b/deepray/optimizers/__init__.py index 8eaff461..c3bb482b 100644 --- a/deepray/optimizers/__init__.py +++ b/deepray/optimizers/__init__.py @@ -38,7 +38,6 @@ from deepray.optimizers.proximal_adagrad import ProximalAdagrad from deepray.optimizers.rectified_adam import RectifiedAdam from deepray.optimizers.stochastic_weight_averaging import SWA -from deepray.optimizers.weight_decay_optimizers import AdamW from deepray.optimizers.adabelief import AdaBelief from deepray.optimizers.weight_decay_optimizers import SGDW from deepray.optimizers.weight_decay_optimizers import ( @@ -50,3 +49,7 @@ from deepray.optimizers.yogi import Yogi from deepray.optimizers.cocob import COCOB from deepray.optimizers.adam import Adam +from deepray.optimizers.adam_async import AdamAsync +from deepray.optimizers.gradient_descent import SGD +from deepray.optimizers.adagrad import Adagrad +from deepray.optimizers.ftrl import FtrlOptimizer \ No newline at end of file diff --git a/deepray/optimizers/adagrad.py b/deepray/optimizers/adagrad.py new file mode 100644 index 00000000..31c046a1 --- /dev/null +++ b/deepray/optimizers/adagrad.py @@ -0,0 +1,83 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Adagrad for Deepray.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys + +import tensorflow as tf +from absl import flags + +from deepray.custom_ops.embedding_variable import gen_kv_variable_ops +from deepray.custom_ops.embedding_variable import kv_variable_ops +from .ev_optimizer_patch import add_slot, SlotConfig, _resource_apply_sparse_duplicate_indices + + +class Adagrad(tf.keras.optimizers.legacy.Adagrad): + + def __init__(self, learning_rate=0.001, **kwargs): + super().__init__(learning_rate=learning_rate, **kwargs) + self.global_step = None + flags.FLAGS([sys.argv[0], f"--ev_slot_num={1}"]) + + def _create_slots(self, var_list): + for var in var_list: + dtype = var.dtype.base_dtype + init = tf.compat.v1.constant_initializer(self._initial_accumulator_value, dtype=dtype) + self.add_slot(var, "accumulator", init, slot_config=SlotConfig(slot_index=1, slot_num=1)) + + def _resource_apply_sparse(self, grad, var, indices, apply_state=None, indices_counts=None): + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = (apply_state or {}).get((var_device, var_dtype)) or self._fallback_apply_state(var_device, var_dtype) + + acc = self.get_slot(var, "accumulator") + if isinstance(var, kv_variable_ops.EmbeddingVariable): + if indices_counts != None: + return gen_kv_variable_ops.kv_resource_sparse_apply_adagrad_with_counts( + var.handle, + acc.handle, + coefficients["lr_t"], + grad, + indices, + self.global_step, + indices_counts, + use_locking=self._use_locking + ) + else: + return gen_kv_variable_ops.kv_resource_sparse_apply_adagrad( + var.handle, + acc.handle, + coefficients["lr_t"], + grad, + indices, + self.global_step, + use_locking=self._use_locking + ) + else: + return tf.raw_ops.ResourceSparseApplyAdagradV2( + var=var.handle, + accum=acc.handle, + lr=coefficients["lr_t"], + epsilon=coefficients["epsilon"], + grad=grad, + indices=indices, + use_locking=self._use_locking, + ) + + +Adagrad.add_slot = add_slot +Adagrad._resource_apply_sparse_duplicate_indices = _resource_apply_sparse_duplicate_indices diff --git a/deepray/optimizers/adam.py b/deepray/optimizers/adam.py index 0a909e83..35c2a3ff 100644 --- a/deepray/optimizers/adam.py +++ b/deepray/optimizers/adam.py @@ -19,32 +19,93 @@ from __future__ import absolute_import, division, print_function -from tensorflow.python.keras.optimizer_v2 import adam as tf_adam +import sys +from absl import flags +from tf_keras.src.optimizers.legacy import adam as adam_old + +from deepray.custom_ops.embedding_variable import gen_kv_variable_ops +from deepray.custom_ops.embedding_variable import kv_variable_ops from deepray.custom_ops.training_ops import gen_training_ops +from .ev_optimizer_patch import add_slot, SlotConfig, _resource_apply_sparse_duplicate_indices -class Adam(tf_adam.Adam): +class Adam(adam_old.Adam): """Deepray Adam optimizer for efficient sparse updates""" - def _resource_apply_sparse(self, grad, var, indices, apply_state=None): - m = 
self.get_slot(var, 'm') - v = self.get_slot(var, 'v') + def __init__(self, learning_rate=0.001, **kwargs): + super().__init__(learning_rate=learning_rate, **kwargs) + self.global_step = None + flags.FLAGS([sys.argv[0], f"--ev_slot_num={2}"]) + + def _create_slots(self, var_list): + # Create slots for the first and second moments. + # Separate for-loops to respect the ordering of slot variables from v1. + for var in var_list: + self.add_slot(var, "m", slot_config=SlotConfig(slot_index=1, slot_num=2)) + for var in var_list: + self.add_slot(var, "v", slot_config=SlotConfig(slot_index=2, slot_num=2)) + if self.amsgrad: + for var in var_list: + self.add_slot(var, "vhat") + + def _resource_apply_sparse(self, grad, var, indices, apply_state=None, indices_counts=None): var_device, var_dtype = var.device, var.dtype.base_dtype coefficients = ( (apply_state or {}).get((var_device, var_dtype)) or self._fallback_apply_state(var_device, var_dtype) ) - return gen_training_ops.resource_sparse_apply_adam( - var=var.handle, - m=m.handle, - v=v.handle, - beta1_power=coefficients['beta_1_power'], - beta2_power=coefficients['beta_2_power'], - lr=coefficients['lr_t'], - beta1=coefficients['beta_1_t'], - beta2=coefficients['beta_2_t'], - epsilon=coefficients['epsilon'], - grad=grad, - indices=indices, - use_locking=self._use_locking - ) + m = self.get_slot(var, 'm') + v = self.get_slot(var, 'v') + if isinstance(var, kv_variable_ops.EmbeddingVariable): + if indices_counts is not None: + return gen_kv_variable_ops.kv_resource_sparse_apply_adam_with_counts( + var.handle, + m.handle, + v.handle, + coefficients['beta_1_power'], + coefficients['beta_2_power'], + coefficients['lr_t'], + coefficients['beta_1_t'], + coefficients['beta_2_t'], + coefficients['epsilon'], + grad, + indices, + self.global_step, + indices_counts, + use_locking=self._use_locking + ) + else: + return gen_kv_variable_ops.kv_resource_sparse_apply_adam( + var.handle, + m.handle, + v.handle, + coefficients['beta_1_power'], + coefficients['beta_2_power'], + coefficients['lr_t'], + coefficients['beta_1_t'], + coefficients['beta_2_t'], + coefficients['epsilon'], + grad, + indices, + self.global_step, + use_locking=self._use_locking + ) + else: + return gen_training_ops.resource_sparse_apply_adam( + var=var.handle, + m=m.handle, + v=v.handle, + beta1_power=coefficients['beta_1_power'], + beta2_power=coefficients['beta_2_power'], + lr=coefficients['lr_t'], + beta1=coefficients['beta_1_t'], + beta2=coefficients['beta_2_t'], + epsilon=coefficients['epsilon'], + grad=grad, + indices=indices, + use_locking=self._use_locking + ) + + +Adam.add_slot = add_slot +Adam._resource_apply_sparse_duplicate_indices = _resource_apply_sparse_duplicate_indices diff --git a/deepray/optimizers/adam_async.py b/deepray/optimizers/adam_async.py new file mode 100644 index 00000000..0da3bad6 --- /dev/null +++ b/deepray/optimizers/adam_async.py @@ -0,0 +1,188 @@ +# Copyright 2025 The Deepray Authors. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
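For the deepray/optimizers/adam.py rewrite above: gradients that arrive as IndexedSlices still get their duplicate indices summed first, and only EmbeddingVariable instances take the kv_resource_* kernels; an ordinary tf.Variable falls through to the stock sparse-apply path. A rough usage sketch, assuming a working Deepray build with its custom ops importable:

import tensorflow as tf
from deepray.optimizers import Adam  # the patched class from this diff

table = tf.Variable(tf.zeros([100, 16]))  # plain dense variable, not an EmbeddingVariable
opt = Adam(learning_rate=0.001)

with tf.GradientTape() as tape:
  rows = tf.nn.embedding_lookup(table, tf.constant([3, 7, 3]))
  loss = tf.reduce_sum(rows)

grads = tape.gradient(loss, [table])  # IndexedSlices with a repeated index (3)
opt.apply_gradients(zip(grads, [table]))  # repeats are summed, then the sparse kernel runs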
+"""AdamAsync optimizer for Deepray. +""" + +from __future__ import absolute_import, division, print_function + +import sys + +import tensorflow as tf +from absl import flags +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops + +from deepray.custom_ops.embedding_variable import config_pb2 +from deepray.custom_ops.embedding_variable import gen_kv_variable_ops +from deepray.custom_ops.embedding_variable import kv_variable_ops +from deepray.custom_ops.training_ops import gen_training_ops +from .ev_optimizer_patch import add_slot, SlotConfig, _resource_apply_sparse_duplicate_indices + + +class AdamAsync(tf.keras.optimizers.legacy.Adam): + """Deepray Adam optimizer for efficient sparse updates""" + + def __init__(self, learning_rate=0.001, apply_sparse_rmsprop=False, **kwargs): + super().__init__(learning_rate=learning_rate, **kwargs) + self._apply_sparse_rmsprop = apply_sparse_rmsprop + self.global_step = None + flags.FLAGS([sys.argv[0], f"--ev_slot_num={2}"]) + + def _create_slots(self, var_list): + # Create slots for the first and second moments. + # Separate for-loops to respect the ordering of slot variables from v1. + for var in var_list: + self.add_slot(var, "m", slot_config=SlotConfig(slot_index=1, slot_num=2)) + # for var in var_list: + self.add_slot(var, "v", slot_config=SlotConfig(slot_index=2, slot_num=2)) + if isinstance(var, kv_variable_ops.EmbeddingVariable): + self.add_slot( + var, + slot_name="beta1_power", + initializer=array_ops.expand_dims(self._get_hyper("beta_1", var.dtype.base_dtype), -1), + slot_config=SlotConfig(slot_type=config_pb2.SlotType.VARIABLE) + ) + self.add_slot( + var, + slot_name="beta2_power", + initializer=array_ops.expand_dims(self._get_hyper("beta_2", var.dtype.base_dtype), -1), + slot_config=SlotConfig(slot_type=config_pb2.SlotType.VARIABLE) + ) + else: + self.add_slot( + var, + slot_name="beta1_power", + initializer=self._get_hyper("beta_1", var.dtype.base_dtype), + slot_config=SlotConfig(slot_type=config_pb2.SlotType.VARIABLE) + ) + self.add_slot( + var, + slot_name="beta2_power", + initializer=self._get_hyper("beta_2", var.dtype.base_dtype), + slot_config=SlotConfig(slot_type=config_pb2.SlotType.VARIABLE) + ) + if self.amsgrad: + for var in var_list: + self.add_slot(var, "vhat") + + def _prepare_local(self, var_device, var_dtype, apply_state): + if "learning_rate" in self._hyper: + lr_t = tf.identity(self._decayed_lr(var_dtype)) + apply_state[(var_device, var_dtype)]["lr_t"] = lr_t + + beta_1_t = tf.identity(self._get_hyper("beta_1", var_dtype)) + beta_2_t = tf.identity(self._get_hyper("beta_2", var_dtype)) + # beta_1_power = tf.identity(self._get_hyper("beta1_power", var_dtype)) + # beta_2_power = tf.identity(self._get_hyper("beta2_power", var_dtype)) + + # lr = apply_state[(var_device, var_dtype)]["lr_t"] * (tf.sqrt(1 - beta_2_power) / (1 - beta_1_power)) + apply_state[(var_device, var_dtype)].update( + dict( + # lr=lr, + epsilon=tf.convert_to_tensor(self.epsilon, var_dtype), + beta_1_t=beta_1_t, + # beta_1_power=beta_1_power, + one_minus_beta_1_t=1 - beta_1_t, + beta_2_t=beta_2_t, + # beta_2_power=beta_2_power, + one_minus_beta_2_t=1 - beta_2_t, + ) + ) + + def _resource_apply_dense(self, grad, var): + m = self.get_slot(var, "m") + v = self.get_slot(var, "v") + beta1_power = self.get_slot(var, 'beta1_power') + beta2_power = self.get_slot(var, 'beta2_power') + return gen_training_ops.resource_apply_adam_async( + var.handle, + m.handle, + v.handle, + 
beta1_power.handle, + beta2_power.handle, + math_ops.cast(self._lr_t, grad.dtype.base_dtype), + math_ops.cast(self._beta1_t, grad.dtype.base_dtype), + math_ops.cast(self._beta2_t, grad.dtype.base_dtype), + math_ops.cast(self._epsilon_t, grad.dtype.base_dtype), + grad, + use_locking=self._use_locking, + apply_sparse_rmsprop=self._apply_sparse_rmsprop + ) + + def _resource_apply_sparse(self, grad, var, indices, apply_state=None, indices_counts=None): + m = self.get_slot(var, 'm') + v = self.get_slot(var, 'v') + beta1_power = self.get_slot(var, 'beta1_power') + beta2_power = self.get_slot(var, 'beta2_power') + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = ( + (apply_state or {}).get((var_device, var_dtype)) or self._fallback_apply_state(var_device, var_dtype) + ) + + if isinstance(var, kv_variable_ops.EmbeddingVariable): + if indices_counts is not None: + return gen_kv_variable_ops.kv_resource_sparse_apply_adam_async_with_counts( + var.handle, + m.handle, + v.handle, + beta1_power.handle, + beta2_power.handle, + coefficients['lr_t'], + coefficients['beta_1_t'], + coefficients['beta_2_t'], + coefficients['epsilon'], + grad, + indices, + self.global_step, + indices_counts, + use_locking=self._use_locking, + apply_sparse_rmsprop=self._apply_sparse_rmsprop + ) + else: + return gen_kv_variable_ops.kv_resource_sparse_apply_adam_async( + var.handle, + m.handle, + v.handle, + beta1_power.handle, + beta2_power.handle, + coefficients['lr_t'], + coefficients['beta_1_t'], + coefficients['beta_2_t'], + coefficients['epsilon'], + grad, + indices, + self.global_step, + use_locking=self._use_locking, + apply_sparse_rmsprop=self._apply_sparse_rmsprop + ) + else: + return gen_training_ops.resource_sparse_apply_adam_async( + var=var.handle, + m=m.handle, + v=v.handle, + beta1_power=beta1_power.handle, + beta2_power=beta2_power.handle, + lr=coefficients['lr_t'], + beta1=coefficients['beta_1_t'], + beta2=coefficients['beta_2_t'], + epsilon=coefficients['epsilon'], + grad=grad, + indices=indices, + use_locking=self._use_locking, + apply_sparse_rmsprop=self._apply_sparse_rmsprop + ) + + +AdamAsync.add_slot = add_slot +AdamAsync._resource_apply_sparse_duplicate_indices = _resource_apply_sparse_duplicate_indices diff --git a/deepray/optimizers/ev_optimizer_patch.py b/deepray/optimizers/ev_optimizer_patch.py new file mode 100644 index 00000000..ba6e391f --- /dev/null +++ b/deepray/optimizers/ev_optimizer_patch.py @@ -0,0 +1,260 @@ +# Copyright 2024 The Deepray Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""EmbeddingVariable optimizer.""" + +import tensorflow as tf +from packaging.version import parse +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops + +from deepray.custom_ops.embedding_variable import config_pb2 +from deepray.custom_ops.embedding_variable import variables as ev_variables +from deepray.custom_ops.unique_ops import gen_array_ops + +if parse(tf.__version__) < parse("2.11.0"): + from keras.optimizers.legacy.optimizer_v2 import _var_key +elif parse(tf.__version__) > parse("2.16.0"): + from tf_keras.src.optimizers.legacy.optimizer_v2 import _var_key + from tf_keras.src import backend + from tf_keras.src.optimizers.legacy.optimizer_v2 import _deduplicate_indexed_slices +else: + from keras.src.optimizers.legacy.optimizer_v2 import _var_key + from keras.src import backend + from keras.src.optimizers.legacy.optimizer_v2 import _deduplicate_indexed_slices + +import tf_keras as keras +import functools + +from deepray.custom_ops.embedding_variable.python import kv_variable_ops + +from tensorflow.core.framework import attr_value_pb2 +from deepray.custom_ops.embedding_variable.variable_scope import get_embedding_variable_internal, get_embedding_variable_v2_internal + + +class SlotConfig: + + def __init__(self, slot_num=1, slot_index=0, slot_type=config_pb2.SlotType.EMBEDDING_VARIABLE): + self.slot_num = slot_num + self.slot_index = slot_index + self.slot_type = slot_type + + +def _set_init_op_embedding_type_attr(var, embedding_type): + var._init_op._set_attr("embedding_variable_type", attr_value_pb2.AttrValue(i=embedding_type)) + var._initializer_for_restore._set_attr("embedding_variable_type", attr_value_pb2.AttrValue(i=embedding_type)) + + +def _set_init_op_slot_num_attr(var, slot_num): + var._init_op._set_attr("slot_num", attr_value_pb2.AttrValue(i=slot_num)) + var._initializer_for_restore._set_attr("slot_num", attr_value_pb2.AttrValue(i=slot_num)) + + +def add_slot(self, var, slot_name, initializer="zeros", shape=None, slot_config=None): + """Add a new slot variable for `var`. + + A slot variable is an additional variable associated with `var` to + train. It is allocated and managed by optimizers, e.g. `Adam`. + + Args: + var: a `Variable` object. + slot_name: name of the slot variable. + initializer: initializer of the slot variable + shape: (Optional) shape of the slot variable. If not set, it will + default to the shape of `var`. + + Returns: + A slot variable. 
+ """ + if slot_name not in self._slot_names: + self._slot_names.append(slot_name) + var_key = _var_key(var) + slot_dict = self._slots.setdefault(var_key, {}) + weight = slot_dict.get(slot_name, None) + if weight is None: + if isinstance(initializer, str) or callable(initializer): + initializer = keras.initializers.get(initializer) + if isinstance( + initializer, + tf.__internal__.tracking.CheckpointInitialValueCallable, + ) or (shape is not None): + slot_shape = shape + else: + slot_shape = var.shape + initial_value = functools.partial(initializer, shape=slot_shape, dtype=var.dtype) + else: + initial_value = initializer + + if isinstance(var, kv_variable_ops.EmbeddingVariable): + if slot_config is None: + weight = get_embedding_variable_internal( + name=f"{var._shared_name}/{slot_name}", + initializer=initializer, + trainable=False, + embedding_dim=slot_shape, + key_dtype=var._invalid_key_type, + value_dtype=var.dtype, + validate_shape=slot_shape.is_fully_defined(), + steps_to_live=var._steps_to_live, + ht_partition_num=var._ht_partition_num + ) + # _set_init_op_embedding_type_attr(weight, config_pb2.EmbeddingVariableType.MUTABLE) + else: + filter_strategy = None + if var._filter_freq != 0: + if var._max_element_size != 0: + filter_strategy = ev_variables.CBFFilter( + filter_freq=var._filter_freq, + max_element_size=var._max_element_size, + false_positive_probability=var._false_positive_probability, + counter_type=var._counter_type + ) + else: + filter_strategy = ev_variables.CounterFilter(filter_freq=var._filter_freq) + if slot_config.slot_type is config_pb2.SlotType.EMBEDDING_VARIABLE: + # _set_init_op_slot_num_attr(var, slot_config.slot_num) + var._slot_num = slot_config.slot_num + emb_index = var._emb_index + if var.block_num > 1: + var = var._primary + weight = get_embedding_variable_v2_internal( + name=f"{var._shared_name}/{slot_name}", + initializer=initializer, + trainable=False, + embedding_dim=slot_shape, + key_dtype=var._invalid_key_type, + value_dtype=var.dtype, + validate_shape=slot_shape.is_fully_defined(), + evconfig=ev_variables.EmbeddingVariableConfig( + steps_to_live=var._steps_to_live, + handle_name=var._block_handle_name, + emb_index=emb_index, + block_num=var.block_num, + slot_index=slot_config.slot_index, + primary=var._primary, + slot_num=slot_config.slot_num, + storage_type=var.storage_type, + storage_path=var._storage_path, + storage_size=var._storage_size, + storage_cache_strategy=var._storage_cache_strategy, + layout=var._layout, + l2_weight_threshold=var._l2_weight_threshold, + filter_strategy=filter_strategy + ) + ) + else: + weight = tf.Variable( + name=f"{var._shared_name}/{slot_name}", + dtype=var.dtype, + trainable=False, + initial_value=initial_value, + ) + else: + with self._distribution_strategy_scope(): + strategy = tf.distribute.get_strategy() + if not strategy.extended.variable_created_in_scope(var): + raise ValueError( + "Trying to create optimizer slot variable under the " + "scope for tf.distribute.Strategy ({}), which is " + "different from the scope used for the original " + "variable ({}). Make sure the slot variables are " + "created under the same strategy scope. 
This may " + "happen if you're restoring from a checkpoint " + "outside the scope.".format(strategy, var) + ) + + with strategy.extended.colocate_vars_with(var): + weight = tf.Variable( + name=f"{var._shared_name}/{slot_name}", + dtype=var.dtype, + trainable=False, + initial_value=initial_value, + ) + + backend.track_variable(weight) + slot_dict[slot_name] = weight + self._restore_slot_variable(slot_name=slot_name, variable=var, slot_variable=weight) + self._weights.append(weight) + return weight + + +def _deduplicate_indexed_slices_with_counts(values, indices): + """Sums `values` associated with any non-unique `indices` + and return counts of each count in `values`.""" + unique_indices, new_index_positions, indices_counts = \ + gen_array_ops.deepray_unique_with_counts(indices, out_idx=dtypes.int64) + summed_values = math_ops.unsorted_segment_sum(values, new_index_positions, array_ops.shape(unique_indices)[0]) + return summed_values, unique_indices, indices_counts + + +def _deduplicate_indexed_slices_with_counts_reduction(values, indices, extra_counts, extra_indices): + """Sums `values` associated with any non-unique `indices` + and return counts of each count in `values`.""" + unique_indices, new_index_positions, summed_counts = \ + gen_array_ops.deepray_unique_with_extra_counts(indices, extra_indices, extra_counts) + summed_values = math_ops.unsorted_segment_sum(values, new_index_positions, array_ops.shape(unique_indices)[0]) + return summed_values, unique_indices, summed_counts + + +def _resource_apply_sparse_duplicate_indices(self, grad, handle, indices, **kwargs): + """Add ops to apply sparse gradients to `handle`, with repeated indices. + + Optimizers which override this method must deal with repeated indices. See + the docstring of `_apply_sparse_duplicate_indices` for details. By default + the correct behavior, to sum non-unique indices and their associated + gradients, is enforced by first pre-processing `grad` and `indices` and + passing them on to `_resource_apply_sparse`. Optimizers which deal correctly + with duplicate indices may instead override this method to avoid the + overhead of summing. + + Args: + grad: a `Tensor` representing the gradient for the affected indices. + handle: a `Tensor` of dtype `resource` which points to the variable + to be updated. + indices: a `Tensor` of integral type representing the indices for + which the gradient is nonzero. Indices may be repeated. + + Returns: + An `Operation` which updates the value of the variable. 
+ """ + from deepray.custom_ops.embedding_variable import kv_variable_ops + if isinstance(handle, kv_variable_ops.EmbeddingVariable) and handle.need_counts(): + if len(handle._counts_tensor.keys()) == 0: + summed_grad, unique_indices, indices_counts = \ + _deduplicate_indexed_slices_with_counts( + values=grad, indices=indices) + else: + extra_counts, extra_indices = [], [] + if indices.op.type == "ConcatV2": + for tensor in indices.op.inputs: + if tensor.op.type == "Reshape": + indices_tensor = tensor.op.inputs[0] + if indices_tensor in handle._counts_tensor: + extra_counts.append(handle._counts_tensor[indices_tensor]) + extra_indices.append(indices_tensor) + elif indices.op.type == "Reshape": + indices_tensor = indices.op.inputs[0] + if indices_tensor in handle._counts_tensor: + extra_counts.append(handle._counts_tensor[indices_tensor]) + extra_indices.append(indices_tensor) + summed_grad, unique_indices, indices_counts = \ + _deduplicate_indexed_slices_with_counts_reduction( + grad, indices, extra_counts, extra_indices) + return self._resource_apply_sparse( + grad=summed_grad, var=handle, indices=unique_indices, indices_counts=indices_counts, **kwargs + ) + else: + summed_grad, unique_indices = _deduplicate_indexed_slices(values=grad, indices=indices) + return self._resource_apply_sparse(summed_grad, handle, unique_indices, **kwargs) diff --git a/deepray/optimizers/ftrl.py b/deepray/optimizers/ftrl.py new file mode 100644 index 00000000..33c2c2f7 --- /dev/null +++ b/deepray/optimizers/ftrl.py @@ -0,0 +1,96 @@ +import sys + +import tensorflow as tf +from absl import flags + +from deepray.custom_ops.embedding_variable import gen_kv_variable_ops +from deepray.custom_ops.embedding_variable import kv_variable_ops +from .ev_optimizer_patch import add_slot, SlotConfig + + +class FtrlOptimizer(tf.keras.optimizers.legacy.Ftrl): + + def __init__(self, learning_rate=0.001, **kwargs): + super().__init__(learning_rate=learning_rate, **kwargs) + self.global_step = None + flags.FLAGS([sys.argv[0], f"--ev_slot_num={2}"]) + + def _create_slots(self, var_list): + # Create the "accum" and "linear" slots. + for var in var_list: + dtype = var.dtype.base_dtype + init = tf.compat.v1.constant_initializer(self._initial_accumulator_value, dtype=dtype) + self.add_slot(var, "accumulator", init, slot_config=SlotConfig(slot_index=1, slot_num=2)) + self.add_slot(var, "linear", slot_config=SlotConfig(slot_index=2, slot_num=2)) + + def _resource_apply_sparse(self, grad, var, indices, apply_state=None): + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = (apply_state or {}).get((var_device, var_dtype)) or self._fallback_apply_state(var_device, var_dtype) + + # Adjust L2 regularization strength to include beta to avoid the + # underlying TensorFlow ops needing to include it. 
+ adjusted_l2_regularization_strength = coefficients["l2_regularization_strength" + ] + coefficients["beta"] / (2.0 * coefficients["lr_t"]) + + accum = self.get_slot(var, "accumulator") + linear = self.get_slot(var, "linear") + + if self._l2_shrinkage_regularization_strength <= 0.0: + if isinstance(var, kv_variable_ops.EmbeddingVariable): + return gen_kv_variable_ops.kv_resource_sparse_apply_ftrl( + var.handle, + accum.handle, + linear.handle, + grad, + indices, + coefficients["lr_t"], + coefficients["l1_regularization_strength"], + adjusted_l2_regularization_strength, + coefficients["learning_rate_power"], + use_locking=self._use_locking + ) + else: + return tf.raw_ops.ResourceSparseApplyFtrl( + var=var.handle, + accum=accum.handle, + linear=linear.handle, + grad=grad, + indices=indices, + lr=coefficients["lr_t"], + l1=coefficients["l1_regularization_strength"], + l2=adjusted_l2_regularization_strength, + lr_power=coefficients["learning_rate_power"], + use_locking=self._use_locking, + ) + else: + if isinstance(var, kv_variable_ops.EmbeddingVariable): + return gen_kv_variable_ops.kv_resource_sparse_apply_ftrl_v2( + var.handle, + accum.handle, + linear.handle, + grad, + indices, + coefficients["lr_t"], + coefficients["l1_regularization_strength"], + adjusted_l2_regularization_strength, + coefficients["l2_shrinkage_regularization_strength"], + coefficients["learning_rate_power"], + use_locking=self._use_locking + ) + else: + return tf.raw_ops.ResourceSparseApplyFtrlV2( + var=var.handle, + accum=accum.handle, + linear=linear.handle, + grad=grad, + indices=indices, + lr=coefficients["lr_t"], + l1=coefficients["l1_regularization_strength"], + l2=adjusted_l2_regularization_strength, + l2_shrinkage=coefficients["l2_shrinkage_regularization_strength"], + lr_power=coefficients["learning_rate_power"], + use_locking=self._use_locking, + ) + + +FtrlOptimizer.add_slot = add_slot diff --git a/deepray/optimizers/gradient_descent.py b/deepray/optimizers/gradient_descent.py new file mode 100644 index 00000000..1ff6f25c --- /dev/null +++ b/deepray/optimizers/gradient_descent.py @@ -0,0 +1,91 @@ +# Copyright 2024 The Deepray Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
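The dedup helpers in ev_optimizer_patch.py above reduce repeated lookup ids to unique ids, summed gradient rows, and per-id occurrence counts, which the *_with_counts kernels use for frequency-aware updates. The same computation sketched with stock TF ops instead of the custom deepray_unique_with_counts kernel (illustrative values only):

import tensorflow as tf

indices = tf.constant([3, 7, 3, 9, 7, 3], dtype=tf.int64)  # lookup ids, with repeats
values = tf.ones([6, 4])  # one gradient row per lookup

unique_ids, positions, counts = tf.unique_with_counts(indices)
summed = tf.math.unsorted_segment_sum(values, positions, tf.shape(unique_ids)[0])
# unique_ids == [3, 7, 9], counts == [3, 2, 1]; each row of summed is the gradient
# sum for one unique id, which is what _resource_apply_sparse ultimately receives.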
+# ============================================================================== +"""GradientDescentOptimizer for Deepray.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tf_keras.src.optimizers.legacy import gradient_descent as gd_old + +from deepray.custom_ops.embedding_variable import gen_kv_variable_ops +from deepray.custom_ops.embedding_variable import kv_variable_ops + + +class SGD(gd_old.SGD): + + def __init__(self, learning_rate=0.01, **kwargs): + super().__init__(learning_rate=learning_rate, **kwargs) + self.global_step = None + + def _resource_apply_sparse_duplicate_indices(self, grad, var, indices, **kwargs): + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = kwargs.get("apply_state", {}).get((var_device, var_dtype) + ) or self._fallback_apply_state(var_device, var_dtype) + if self._momentum: + # This method is only needed for momentum optimization. + momentum_var = self.get_slot(var, "momentum") + return tf.raw_ops.ResourceSparseApplyKerasMomentum( + var=var.handle, + accum=momentum_var.handle, + lr=coefficients["lr_t"], + grad=grad, + indices=indices, + momentum=coefficients["momentum"], + use_locking=self._use_locking, + use_nesterov=self.nesterov, + ) + else: + if isinstance(var, kv_variable_ops.EmbeddingVariable): + if var.need_counts() and len(var._counts_tensor.keys()) != 0: + extra_counts, extra_indices = [], [] + if indices.op.type == "ConcatV2": + for tensor in indices.op.inputs: + if tensor.op.type == "Reshape": + indices_tensor = tensor.op.inputs[0] + if indices_tensor in var._counts_tensor: + extra_counts.append(var._counts_tensor[indices_tensor]) + extra_indices.append(indices_tensor) + elif indices.op.type == "Reshape": + indices_tensor = indices.op.inputs[0] + if indices_tensor in var._counts_tensor: + extra_counts.append(var._counts_tensor[indices_tensor]) + extra_indices.append(indices_tensor) + + from deepray.custom_ops.unique_ops import gen_array_ops + unique_indices, new_index_positions, indices_counts = \ + gen_array_ops.deepray_unique_with_extra_counts(indices, extra_indices, extra_counts) + summed_grads = math_ops.unsorted_segment_sum(grad, new_index_positions, array_ops.shape(unique_indices)[0]) + return gen_kv_variable_ops.kv_resource_sparse_apply_gradient_descent_with_counts( + var.handle, + coefficients["lr_t"], + summed_grads, + unique_indices, + self.global_step, + indices_counts, + use_locking=self._use_locking + ) + else: + return gen_kv_variable_ops.kv_resource_sparse_apply_gradient_descent( + var.handle, coefficients["lr_t"], grad, indices, self.global_step, use_locking=self._use_locking + ) + else: + return tf.raw_ops.ResourceScatterAdd( + resource=var.handle, + indices=indices, + updates=-grad * coefficients["lr_t"], + ) diff --git a/deepray/optimizers/lazy_adam.py b/deepray/optimizers/lazy_adam.py index 6fda8c3d..2c940f32 100644 --- a/deepray/optimizers/lazy_adam.py +++ b/deepray/optimizers/lazy_adam.py @@ -23,17 +23,18 @@ import importlib import tensorflow as tf from deepray.utils.types import FloatTensorLike +import tf_keras as keras from typeguard import typechecked from typing import Union, Callable -if importlib.util.find_spec("tensorflow.keras.optimizers.legacy") is not None: - adam_optimizer_class = tf.keras.optimizers.legacy.Adam +if importlib.util.find_spec("tf_keras.optimizers.legacy") is not None: + adam_optimizer_class 
= keras.optimizers.legacy.Adam else: - adam_optimizer_class = tf.keras.optimizers.Adam + adam_optimizer_class = keras.optimizers.Adam -@tf.keras.utils.register_keras_serializable(package="Deepray") +@keras.utils.register_keras_serializable(package="Deepray") class LazyAdam(adam_optimizer_class): """Variant of the Adam optimizer that handles sparse updates more efficiently. @@ -67,7 +68,7 @@ def __init__( Args: learning_rate: A `Tensor` or a floating point value. or a schedule - that is a `tf.keras.optimizers.schedules.LearningRateSchedule` + that is a `keras.optimizers.schedules.LearningRateSchedule` The learning rate. beta_1: A `float` value or a constant `float` tensor. The exponential decay rate for the 1st moment estimates. @@ -142,3 +143,6 @@ def _resource_scatter_operate(self, resource, indices, update, resource_scatter_ } return resource_scatter_op(**resource_update_kwargs) + + def get_config(self): + return super().get_config() diff --git a/deepray/optimizers/multi_optimizer.py b/deepray/optimizers/multi_optimizer.py index 273710d4..aae91dd3 100644 --- a/deepray/optimizers/multi_optimizer.py +++ b/deepray/optimizers/multi_optimizer.py @@ -12,20 +12,39 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Discriminative Layer Training Optimizer for TensorFlow.""" +"""Multiple Optimizer for TensorFlow. +References: +1. https://github.com/tensorflow/recommenders/blob/7caed557b9d5194202d8323f2d4795231a5d0b1d/tensorflow_recommenders/experimental/optimizers/composite_optimizer.py#L25 +2. https://github.com/tensorflow/addons/blob/d208d752e98c310280938efa939117bf635a60a8/tensorflow_addons/optimizers/discriminative_layer_training.py#L47 +3. https://github.com/NVIDIA-Merlin/models/blob/eb1e54196a64a70950b2a7e7744d2150e052d53e/merlin/models/tf/blocks/optimizer.py#L73 +""" from collections import defaultdict from typing import List, Union import tensorflow as tf -from tensorflow.keras.optimizers import Optimizer as keras_optimizer -from tensorflow.python.keras.optimizer_v2 import optimizer_v2 -from tensorflow.python.training import optimizer +from packaging.version import Version from typeguard import typechecked -import deepray as dp from deepray.optimizers import KerasLegacyOptimizer +if Version(tf.__version__).release >= Version("2.16").release: + # Determine if loading keras 2 or 3. + if (hasattr(tf.keras, "version") and Version(tf.keras.version()).release >= Version("3.0").release): + # New versions of Keras require importing from `keras.src` when + # importing internal symbols. + from keras.src import backend + from keras.src.utils import tf_utils + else: + from tf_keras.src import backend + from tf_keras.src.utils import tf_utils +elif Version(tf.__version__).release >= Version("2.13").release: + from keras.src import backend + from keras.src.utils import tf_utils +else: + from keras import backend + from keras.utils import tf_utils + class MultiOptimizer(KerasLegacyOptimizer): """Multi Optimizer Wrapper for Discriminative Layer Training. 
@@ -86,28 +105,19 @@ def __init__( name: str = "MultiOptimizer", **kwargs, ): - super(MultiOptimizer, self).__init__(name, **kwargs) if default_optimizer is None: - raise RuntimeError("Must specify `default_optimizer`.") + raise RuntimeError("Must specify a `default_optimizer`.") self.optimizers_and_varnames = optimizers_and_varnames self.default_optimizer = default_optimizer - if isinstance(self, optimizer.Optimizer): - self.compute_gradients = self.default_optimizer.compute_gradients - elif isinstance(self, optimizer_v2.OptimizerV2) or isinstance(self, keras_optimizer): - self.compute_gradients = self.default_optimizer._compute_gradients - else: - raise Exception("Optimizer type is not supported! got {}".format(str(type(self)))) + def apply_gradients(self, grads_and_vars, **kwargs): + """Wrapped apply_gradient method. - def minimize(self, loss, var_list, tape): - # Compute gradients - grads_and_vars = self.compute_gradients(loss=loss, var_list=var_list, tape=tape) - self.apply_gradients(grads_and_vars) - - def apply_gradients(self, grads_and_vars, name=None, **kwargs): + Returns an operation to be executed. + """ # Create a dictionary with a default optimizer and an empty variable list - var_dict, grad_dict = defaultdict(list), defaultdict(list) + grad_var_dict = defaultdict(list) # Iterate over the trainable variables list for grad, var in grads_and_vars: @@ -115,37 +125,33 @@ def apply_gradients(self, grads_and_vars, name=None, **kwargs): for optimizer, varnames in self.optimizers_and_varnames: if any(name in var.name for name in varnames.split(',')): # If it does, append the variable to the optimizer's variable list - var_dict[optimizer].append(var) - grad_dict[optimizer].append(grad) + grad_var_dict[optimizer].append((grad, var)) break else: # If it doesn't, append the variable to the default optimizer's variable list - var_dict[self.default_optimizer].append(var) - grad_dict[self.default_optimizer].append(grad) + grad_var_dict[self.default_optimizer].append((grad, var)) + update_ops = [] # Call the apply_gradients method for each optimizer with the corresponding gradient and variable list - for optimizer, partvar_list in var_dict.items(): - optimizer.apply_gradients(zip(grad_dict[optimizer], partvar_list)) + for optimizer, grad_var in grad_var_dict.items(): + update_ops.append(optimizer.apply_gradients(grad_var, **kwargs)) - def get_config(self): - # https://github.com/tensorflow/addons/blob/062a7aaf33e4618fc3eb55f54915278287bb545f/tensorflow_addons/optimizers/discriminative_layer_training.py#L153 - raise NotImplementedError("MultiOptimizer cannot be serialized because" - " it uses callable to get variables.") + # update_ops = [optimizer.apply_gradients(grad_var, **kwargs) for optimizer, grad_var in grad_var_dict.items()] + update_group = tf.group(update_ops) - @property - def iterations(self): - """The number of training steps this `optimizer` has run. + any_symbolic = any(isinstance(i, tf.Operation) or tf_utils.is_symbolic_tensor(i) for i in update_ops) - By default, iterations would be incremented by one every time - `apply_gradients()` is called. - """ - return self.default_optimizer.iterations + if not tf.executing_eagerly() or any_symbolic: + # If the current context is graph mode or any of the update ops are + # symbolic then the step update should be carried out under a graph + # context. 
(eager updates execute immediately) + with backend._current_graph( # pylint: disable=protected-access + update_ops + ).as_default(): + with tf.control_dependencies([update_group]): + return self.iterations.assign_add(1, read_value=False) - @iterations.setter - def iterations(self, variable): - """See base class.""" - for optimizer, _ in self.optimizers_and_varnames: - optimizer.iterations = variable + return self.iterations.assign_add(1) def variables(self): """Returns the optimizer's variables.""" diff --git a/deepray/optimizers/optimization.py b/deepray/optimizers/optimization.py index 26f3e5ff..2ca36f76 100644 --- a/deepray/optimizers/optimization.py +++ b/deepray/optimizers/optimization.py @@ -26,8 +26,6 @@ from .warmup import WarmUpPolynomial -FLAGS = flags.FLAGS - def create_optimizer(init_lr, num_train_steps, num_warmup_steps, optimizer_type="adam"): """Creates an optimizer with learning rate schedule.""" @@ -81,9 +79,6 @@ def create_optimizer(init_lr, num_train_steps, num_warmup_steps, optimizer_type= # if FLAGS.use_horovod: # import horovod.tensorflow.keras as hvd # optimizer = hvd.DistributedOptimizer(optimizer, backward_passes_per_step=1, average_aggregated_gradients=True) - if FLAGS.use_dynamic_embedding: - from tensorflow_recommenders_addons import dynamic_embedding as de - optimizer = de.DynamicEmbeddingOptimizer(optimizer, synchronous=FLAGS.use_horovod) return optimizer diff --git a/deepray/optimizers/tests/weight_decay_optimizers_test.py b/deepray/optimizers/tests/weight_decay_optimizers_test.py index 27d777c8..c2a1041f 100644 --- a/deepray/optimizers/tests/weight_decay_optimizers_test.py +++ b/deepray/optimizers/tests/weight_decay_optimizers_test.py @@ -373,7 +373,7 @@ def test_var_list_with_exclude_list_sgdw(dtype): ) -if importlib.util.find_spec("tensorflow.keras.optimizers.legacy") is not None: +if importlib.util.find_spec("tf_keras.optimizers.legacy") is not None: optimizer_class = tf.keras.optimizers.legacy.SGD else: optimizer_class = tf.keras.optimizers.SGD diff --git a/deepray/optimizers/weight_decay_optimizers.py b/deepray/optimizers/weight_decay_optimizers.py index 53624611..264bab37 100644 --- a/deepray/optimizers/weight_decay_optimizers.py +++ b/deepray/optimizers/weight_decay_optimizers.py @@ -256,7 +256,7 @@ def _do_use_weight_decay(self, var): return var.ref() in self._decay_var_list -if importlib.util.find_spec("tensorflow.keras.optimizers.legacy") is not None: +if importlib.util.find_spec("tf_keras.optimizers.legacy") is not None: keras_legacy_optimizer = Union[tf.keras.optimizers.legacy.Optimizer, tf.keras.optimizers.Optimizer] else: keras_legacy_optimizer = tf.keras.optimizers.Optimizer diff --git a/deepray/repo.bzl b/deepray/repo.bzl new file mode 100644 index 00000000..8ce8d04d --- /dev/null +++ b/deepray/repo.bzl @@ -0,0 +1,48 @@ +""" TensorFlow Http Archive + +Modified http_archive that allows us to override the TensorFlow commit that is +downloaded by setting an environment variable. This override is to be used for +testing purposes. + +Add the following to your Bazel build command in order to override the +TensorFlow revision. 
+ +build: --action_env TF_REVISION="" + + * `TF_REVISION`: tensorflow revision override (git commit hash) +""" + +_TF_REVISION = "TF_REVISION" + +def _tensorflow_http_archive(ctx): + git_commit = ctx.attr.git_commit + sha256 = ctx.attr.sha256 + patch = getattr(ctx.attr, "patch", None) + + override_git_commit = ctx.os.environ.get(_TF_REVISION) + if override_git_commit: + sha256 = "" + git_commit = override_git_commit + + strip_prefix = "tensorflow-%s" % git_commit + urls = [ + "https://github.com/tensorflow/tensorflow/archive/%s.tar.gz" % git_commit, + ] + ctx.download_and_extract( + urls, + "", + sha256, + "", + strip_prefix, + ) + if patch: + ctx.patch(patch, strip = 1) + +tensorflow_http_archive = repository_rule( + implementation = _tensorflow_http_archive, + attrs = { + "git_commit": attr.string(mandatory = True), + "sha256": attr.string(mandatory = True), + "patch": attr.label(), + }, +) diff --git a/deepray/seq2seq/BUILD b/deepray/seq2seq/BUILD deleted file mode 100644 index 325f9fac..00000000 --- a/deepray/seq2seq/BUILD +++ /dev/null @@ -1,26 +0,0 @@ -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//visibility:public"]) - -py_library( - name = "seq2seq", - srcs = glob(["*.py"]), - data = [ - "//deepray:options.py", - "//deepray/custom_ops/seq2seq:_beam_search_ops.so", - ], - deps = [ - "//deepray/testing", - "//deepray/utils", - ], -) - -py_test( - name = "seq2seq_test", - size = "medium", - srcs = glob(["tests/*"]), - main = "tests/run_all_test.py", - deps = [ - ":seq2seq", - ], -) diff --git a/deepray/seq2seq/__init__.py b/deepray/seq2seq/__init__.py deleted file mode 100644 index 7e5124b5..00000000 --- a/deepray/seq2seq/__init__.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Additional layers for sequence to sequence models.""" - -from deepray.seq2seq.attention_wrapper import AttentionMechanism -from deepray.seq2seq.attention_wrapper import AttentionWrapper -from deepray.seq2seq.attention_wrapper import AttentionWrapperState -from deepray.seq2seq.attention_wrapper import BahdanauAttention -from deepray.seq2seq.attention_wrapper import BahdanauMonotonicAttention -from deepray.seq2seq.attention_wrapper import LuongAttention -from deepray.seq2seq.attention_wrapper import LuongMonotonicAttention -from deepray.seq2seq.attention_wrapper import hardmax -from deepray.seq2seq.attention_wrapper import monotonic_attention -from deepray.seq2seq.attention_wrapper import safe_cumprod - -from deepray.seq2seq.basic_decoder import BasicDecoder -from deepray.seq2seq.basic_decoder import BasicDecoderOutput - -from deepray.seq2seq.beam_search_decoder import BeamSearchDecoder -from deepray.seq2seq.beam_search_decoder import BeamSearchDecoderOutput -from deepray.seq2seq.beam_search_decoder import BeamSearchDecoderState -from deepray.seq2seq.beam_search_decoder import FinalBeamSearchDecoderOutput -from deepray.seq2seq.beam_search_decoder import gather_tree -from deepray.seq2seq.beam_search_decoder import gather_tree_from_array -from deepray.seq2seq.beam_search_decoder import tile_batch - -from deepray.seq2seq.decoder import BaseDecoder -from deepray.seq2seq.decoder import Decoder -from deepray.seq2seq.decoder import dynamic_decode - -from deepray.seq2seq.loss import SequenceLoss -from deepray.seq2seq.loss import sequence_loss - -from deepray.seq2seq.sampler import CustomSampler -from deepray.seq2seq.sampler import GreedyEmbeddingSampler -from deepray.seq2seq.sampler import InferenceSampler -from deepray.seq2seq.sampler import SampleEmbeddingSampler -from deepray.seq2seq.sampler import Sampler -from deepray.seq2seq.sampler import ScheduledEmbeddingTrainingSampler -from deepray.seq2seq.sampler import ScheduledOutputTrainingSampler -from deepray.seq2seq.sampler import TrainingSampler diff --git a/deepray/tensorflow.bzl b/deepray/tensorflow.bzl deleted file mode 100644 index 082a5518..00000000 --- a/deepray/tensorflow.bzl +++ /dev/null @@ -1,333 +0,0 @@ -load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library") - -cc_shared_library = native.cc_shared_library - -def if_google(google_value, oss_value = []): - """Returns one of the arguments based on the non-configurable build env. - - Specifically, it does not return a `select`, and can be used to e.g. - compute elements of list attributes. - """ - return oss_value # copybara:comment_replace return google_value - -def clean_dep(target): - """Returns string to 'target' in @org_tensorflow repository. - - Use this function when referring to targets in the @org_tensorflow - repository from macros that may be called from external repositories. - """ - - # A repo-relative label is resolved relative to the file in which the - # Label() call appears, i.e. @org_tensorflow. - return str(Label(target)) - -# Include specific extra dependencies when building statically, or -# another set of dependencies otherwise. If "macos" is provided, that -# dependency list is used when using the framework_shared_object config -# on MacOS platforms. If "macos" is not provided, the "otherwise" list is -# used for all framework_shared_object platforms including MacOS. 
-def if_static(extra_deps, otherwise = [], macos = []): - ret = { - str(Label("//deepray:framework_shared_object")): otherwise, - "//conditions:default": extra_deps, - } - if macos: - ret[str(Label("//deepray:macos_with_framework_shared_object"))] = macos - return select(ret) - -# version for the shared libraries, can -# not contain rc or alpha, only numbers. -# Also update tensorflow/core/public/version.h -# and tensorflow/tools/pip_package/setup.py -VERSION = "2.14.0" -VERSION_MAJOR = VERSION.split(".")[0] -two_gpu_tags = ["requires-gpu-nvidia:2", "notap", "manual", "no_pip"] - -# The workspace root, to be used to set workspace 'include' paths in a way that -# will still work correctly when TensorFlow is included as a dependency of an -# external project. -workspace_root = Label("//:WORKSPACE").workspace_root or "." - -def tf_binary_additional_srcs(fullversion = False): - if fullversion: - suffix = "." + VERSION - else: - suffix = "." + VERSION_MAJOR - - return if_static( - extra_deps = [], - macos = [ - clean_dep("//deepray:libtensorflow_framework%s.dylib" % suffix), - ], - otherwise = [ - clean_dep("//deepray:libtensorflow_framework.so%s" % suffix), - ], - ) - -def _make_search_paths(prefix, levels_to_root): - return ",".join( - [ - "-rpath,%s/%s" % (prefix, "/".join([".."] * search_level)) - for search_level in range(levels_to_root + 1) - ], - ) - -def _rpath_linkopts(name): - # Search parent directories up to the TensorFlow root directory for shared - # object dependencies, even if this op shared object is deeply nested - # (e.g. tensorflow/contrib/package:python/ops/_op_lib.so). tensorflow/ is then - # the root and tensorflow/libtensorflow_framework.so should exist when - # deployed. Other shared object dependencies (e.g. shared between contrib/ - # ops) are picked up as long as they are in either the same or a parent - # directory in the tensorflow/ tree. - levels_to_root = native.package_name().count("/") + name.count("/") - return select({ - clean_dep("//deepray:macos"): [ - "-Wl,%s" % (_make_search_paths("@loader_path", levels_to_root),), - "-Wl,-rename_section,__TEXT,text_env,__TEXT,__text", - ], - clean_dep("//deepray:windows"): [], - "//conditions:default": [ - "-Wl,%s" % (_make_search_paths("$$ORIGIN", levels_to_root),), - ], - }) - -def _rpath_user_link_flags(name): - # Search parent directories up to the TensorFlow root directory for shared - # object dependencies, even if this op shared object is deeply nested - # (e.g. tensorflow/contrib/package:python/ops/_op_lib.so). tensorflow/ is then - # the root and tensorflow/libtensorflow_framework.so should exist when - # deployed. Other shared object dependencies (e.g. shared between contrib/ - # ops) are picked up as long as they are in either the same or a parent - # directory in the tensorflow/ tree. 
- levels_to_root = native.package_name().count("/") + name.count("/") - return select({ - clean_dep("//deepray:macos"): [ - "-Wl,%s" % (_make_search_paths("@loader_path", levels_to_root),), - "-Wl,-rename_section,__TEXT,text_env,__TEXT,__text", - ], - clean_dep("//deepray:windows"): [], - "//conditions:default": [ - "-Wl,%s" % (_make_search_paths("$ORIGIN", levels_to_root),), - ], - }) - -# buildozer: disable=function-docstring-args -def pybind_extension_opensource( - name, - srcs, - module_name = None, - hdrs = [], - dynamic_deps = [], - static_deps = [], - deps = [], - additional_exported_symbols = [], - compatible_with = None, - copts = [], - data = [], - defines = [], - deprecation = None, - features = [], - link_in_framework = False, - licenses = None, - linkopts = [], - pytype_deps = [], - pytype_srcs = [], - restricted_to = None, - srcs_version = "PY3", - testonly = None, - visibility = None, - win_def_file = None): - """Builds a generic Python extension module.""" - _ignore = [module_name] - p = name.rfind("/") - if p == -1: - sname = name - prefix = "" - else: - sname = name[p + 1:] - prefix = name[:p + 1] - so_file = "%s%s.so" % (prefix, sname) - filegroup_name = "%s_filegroup" % name - pyd_file = "%s%s.pyd" % (prefix, sname) - exported_symbols = [ - "init%s" % sname, - "init_%s" % sname, - "PyInit_%s" % sname, - ] + additional_exported_symbols - - exported_symbols_file = "%s-exported-symbols.lds" % name - version_script_file = "%s-version-script.lds" % name - - exported_symbols_output = "\n".join(["_%s" % symbol for symbol in exported_symbols]) - version_script_output = "\n".join([" %s;" % symbol for symbol in exported_symbols]) - - native.genrule( - name = name + "_exported_symbols", - outs = [exported_symbols_file], - cmd = "echo '%s' >$@" % exported_symbols_output, - output_licenses = ["unencumbered"], - visibility = ["//visibility:private"], - testonly = testonly, - ) - - native.genrule( - name = name + "_version_script", - outs = [version_script_file], - cmd = "echo '{global:\n%s\n local: *;};' >$@" % version_script_output, - output_licenses = ["unencumbered"], - visibility = ["//visibility:private"], - testonly = testonly, - ) - - if static_deps: - cc_library_name = so_file + "_cclib" - cc_library( - name = cc_library_name, - hdrs = hdrs, - srcs = srcs + hdrs, - data = data, - deps = deps, - compatible_with = compatible_with, - copts = copts + [ - "-fno-strict-aliasing", - "-fexceptions", - ] + select({ - clean_dep("//deepray:windows"): [], - "//conditions:default": [ - "-fvisibility=hidden", - ], - }), - defines = defines, - features = features + ["-use_header_modules"], - restricted_to = restricted_to, - testonly = testonly, - visibility = visibility, - ) - - cc_shared_library( - name = so_file, - roots = [cc_library_name], - dynamic_deps = dynamic_deps, - static_deps = static_deps, - additional_linker_inputs = [exported_symbols_file, version_script_file], - compatible_with = compatible_with, - deprecation = deprecation, - features = features + ["-use_header_modules"], - licenses = licenses, - restricted_to = restricted_to, - shared_lib_name = so_file, - testonly = testonly, - user_link_flags = linkopts + _rpath_user_link_flags(name) + select({ - clean_dep("//deepray:macos"): [ - # TODO: the -w suppresses a wall of harmless warnings about hidden typeinfo symbols - # not being exported. There should be a better way to deal with this. 
- "-Wl,-w", - "-Wl,-exported_symbols_list,$(location %s)" % exported_symbols_file, - ], - clean_dep("//deepray:windows"): [], - "//conditions:default": [ - "-Wl,--version-script", - "$(location %s)" % version_script_file, - ], - }), - visibility = visibility, - ) - - # cc_shared_library can generate more than one file. - # Solution to avoid the error "variable '$<' : more than one input file." - filegroup( - name = filegroup_name, - srcs = [so_file], - output_group = "main_shared_library_output", - testonly = testonly, - ) - else: - if link_in_framework: - srcs += tf_binary_additional_srcs() - - cc_binary( - name = so_file, - srcs = srcs + hdrs, - data = data, - copts = copts + [ - "-fno-strict-aliasing", - "-fexceptions", - ] + select({ - clean_dep("//deepray:windows"): [], - "//conditions:default": [ - "-fvisibility=hidden", - ], - }), - linkopts = linkopts + _rpath_linkopts(name) + select({ - clean_dep("//deepray:macos"): [ - # TODO: the -w suppresses a wall of harmless warnings about hidden typeinfo symbols - # not being exported. There should be a better way to deal with this. - "-Wl,-w", - "-Wl,-exported_symbols_list,$(location %s)" % exported_symbols_file, - ], - clean_dep("//deepray:windows"): [], - "//conditions:default": [ - "-Wl,--version-script", - "$(location %s)" % version_script_file, - ], - }), - deps = deps + [ - exported_symbols_file, - version_script_file, - ], - defines = defines, - features = features + ["-use_header_modules"], - linkshared = 1, - testonly = testonly, - licenses = licenses, - visibility = visibility, - deprecation = deprecation, - restricted_to = restricted_to, - compatible_with = compatible_with, - ) - - # For Windows, emulate the above filegroup with the shared object. - native.alias( - name = filegroup_name, - actual = so_file, - ) - - # For Windows only. - native.genrule( - name = name + "_pyd_copy", - srcs = [filegroup_name], - outs = [pyd_file], - cmd = "cp $< $@", - output_to_bindir = True, - visibility = visibility, - deprecation = deprecation, - restricted_to = restricted_to, - compatible_with = compatible_with, - testonly = testonly, - ) - - native.py_library( - name = name, - data = select({ - clean_dep("//deepray:windows"): [pyd_file], - "//conditions:default": [so_file], - }) + pytype_srcs, - deps = pytype_deps, - srcs_version = srcs_version, - licenses = licenses, - testonly = testonly, - visibility = visibility, - deprecation = deprecation, - restricted_to = restricted_to, - compatible_with = compatible_with, - ) - -# Export open source version of pybind_extension under base name as well. 
-pybind_extension = pybind_extension_opensource - -def filegroup(**kwargs): - native.filegroup(**kwargs) - -def genrule(**kwargs): - native.genrule(**kwargs) diff --git a/deepray/utils/BUILD b/deepray/utils/BUILD index 8f511360..27d9f0c2 100644 --- a/deepray/utils/BUILD +++ b/deepray/utils/BUILD @@ -12,6 +12,10 @@ py_library( "//deepray:conftest.py", "//deepray:options.py", ], + deps = [ + "@pypi_tf_keras//:pkg", + "@pypi_tqdm//:pkg", + ], ) py_test( diff --git a/deepray/utils/benchmark.py b/deepray/utils/benchmark.py index a5ac1ada..65d8a608 100644 --- a/deepray/utils/benchmark.py +++ b/deepray/utils/benchmark.py @@ -15,7 +15,6 @@ from time import perf_counter import numpy as np -import tensorflow as tf from absl import logging @@ -38,10 +37,7 @@ def __init__(self, warmup_steps=0, total_steps=0): self.benchmark_start_time = None self.benchmark_after_warmup_start_time = None self.latency_percentiles = (90, 95, 99) - with tf.device("/CPU:0"): - self.samples = tf.Variable(0, trainable=False, dtype=tf.int64) - - self.samples.assign(0) + self.samples = 0 self.step_latencies = [0] self._results = {} # used to represent duration of entire training @@ -76,17 +72,17 @@ def _calculate_latency(self): def _calculate_throughput(self): time_elapsed = perf_counter() - self.benchmark_start_time time_elapsed_after_warmup = perf_counter() - self.benchmark_after_warmup_start_time - benchmark_throughput = self.samples.numpy() / time_elapsed_after_warmup - return {"throughput": benchmark_throughput, "time": time_elapsed} + benchmark_throughput = self.samples / time_elapsed_after_warmup + return {"throughput": benchmark_throughput, "time": time_elapsed, "total_samples": self.samples} def __call__(self, steps, global_batch_size): - self.samples.assign_add(steps * global_batch_size) + self.samples += steps * global_batch_size step_latency = perf_counter() - self.step_start_time step_throughput = steps * global_batch_size / step_latency self.step_latencies.append(step_latency) self.step += steps if self.step == self.warmup_steps: - self.samples.assign(0) + self.samples = 0 self.step_latencies = [] self.benchmark_after_warmup_start_time = perf_counter() elif self.step == self.total_steps: diff --git a/deepray/utils/ckpt_util.py b/deepray/utils/ckpt_util.py new file mode 100644 index 00000000..78394151 --- /dev/null +++ b/deepray/utils/ckpt_util.py @@ -0,0 +1,11 @@ +import tensorflow as tf + + +def print_checkpoint(save_path): + reader = tf.train.load_checkpoint(save_path) + shapes = reader.get_variable_to_shape_map() + dtypes = reader.get_variable_to_dtype_map() + print(f"Checkpoint at '{save_path}':") + for key in shapes: + print(f" (key='{key}', shape={shapes[key]}, dtype={dtypes[key].name}, " + f"value={reader.get_tensor(key)})") \ No newline at end of file diff --git a/deepray/utils/data/feature_map.py b/deepray/utils/data/feature_map.py index 316bd7eb..b5d07690 100644 --- a/deepray/utils/data/feature_map.py +++ b/deepray/utils/data/feature_map.py @@ -3,7 +3,7 @@ # @Author : Hailin.Fu # @license : Copyright(C), import os - +import yaml import pandas as pd import tensorflow as tf from absl import logging, flags @@ -11,125 +11,78 @@ from deepray.design_patterns import SingletonType from deepray.utils.horovod_utils import is_main_process -FLAGS = flags.FLAGS - class FeatureMap(metaclass=SingletonType): - def __init__(self, feature_map, black_list=None, white_list=None): - # Read YAML file - # with open(os.path.join(os.path.dirname(__file__), feature_file), encoding="utf-8") as stream: - # try: - # self.conf = 
yaml.safe_load(stream) - # except yaml.YAMLError as exc: - # logging.error(exc) - self._feature_file = feature_map - self._black_list = black_list - self._white_list = white_list if white_list else FLAGS.white_list - self.feature_map = self.get_summary() - if is_main_process() and self.feature_map is not None: - logging.info( - "\n" + - self.feature_map.loc[:, - ~self.feature_map.columns.isin(["bucket_boundaries", "vocabulary_list"])].to_markdown() + def __init__(self): + if flags.FLAGS.config_file: + # Read YAML file + with open(flags.FLAGS.config_file, encoding="utf-8") as stream: + try: + self.yaml_conf = yaml.safe_load(stream) + except yaml.YAMLError as exc: + logging.error(exc) + if flags.FLAGS.feature_map and tf.io.gfile.exists(flags.FLAGS.feature_map): + self.feature_map = self.get_summary( + feature_map=flags.FLAGS.feature_map, black_list=flags.FLAGS.black_list, white_list=flags.FLAGS.white_list ) - - def get_summary(self): - if not tf.io.gfile.exists(self._feature_file): if is_main_process(): - logging.info(f"File not exists: {self._feature_file}") - return None - with tf.io.gfile.GFile(self._feature_file, mode="r") as f: - file_name, file_extension = os.path.splitext(self._feature_file) - if file_extension == ".csv": - feature_map = pd.read_csv( - f, - dtype={ - "code": int, - "name": "string", - "dtype": "string", - "ftype": "string", - "dim": "uint32", - "length": float, - "voc_size": float, - "lr": "float32", - "optimizer": "string", - "storage_type": "string", - "composition_size": "string", - "ev_filter": "string", - }, - ).fillna( - value={ - "code": -1, - "length": 1.0, - "voc_size": 0.0, - "lr": 0.0, - "optimizer": "", - "storage_type": "", - "composition_size": "", - "ev_filter": "", - } + logging.info("Used features map:") + print( + "\n" + + self.feature_map.loc[:, + ~self.feature_map.columns.isin(["bucket_boundaries", "vocabulary_list"])].to_markdown() ) + else: + logging.info(f"feature_map file not exists: {flags.FLAGS.feature_map}") + self.feature_map = None + + def get_summary(self, feature_map, black_list=None, white_list=None): + with tf.io.gfile.GFile(feature_map, mode="r") as f: + file_name, file_extension = os.path.splitext(feature_map) + sep = None + if file_extension == ".csv": + sep = ',' elif file_extension == ".tsv": - feature_map = pd.read_csv( - f, - sep='\t', - header=None, - usecols=[i for i in range(13)], - names=[ - "code", "name", "length", "dtype", "gpercentile", "gcov", "geva", "bpercentile", "bcov", "beva", - "def_valu", "fea_tag", "dim" - ], - dtype={ - "code": "string", - "name": "string", - "length": float, - "dtype": "string", - # "ftype": "string", - "gpercentile": "string", - "geva": "string", - "bpercentile": "string", - "bcov": "string", - "beva": "string", - "def_valu": "string", - "fea_tag": "string" - }, - ).fillna( - value={ - # "code": "", - # "name": "", - "length": 1.0, - # "dtype": "", - # "gpercentile": "", - # "fea_geva": "", - # "fea_bpercentile": "", - # "fea_bcov": "", - # "fea_beva": "", - "def_valu": "", - # "fea_tag": "" - } - ) + sep = '\t' else: ValueError(f"Not support format for {f}") - if self._black_list: - with open(self._black_list) as f: + feature_map = pd.read_csv( + f, + sep=sep, + dtype={ + "code": int, + "name": "string", + "dtype": "string", + "ftype": "string", + "dim": "uint32", + "length": float, + "voc_size": float, + }, + ).fillna(value={ + "code": -1, + "length": 1.0, + "voc_size": 0.0, + }) + if black_list: + with open(black_list) as f: black_feature_list = [feature.strip() for feature in f] 
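    # A white_list, handled below, may be either a single file of feature names
    # or a directory of such files; every name collected is used to keep only the
    # matching rows of feature_map (mirroring the black_list filter above).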
feature_map = feature_map[~feature_map["name"].isin(black_feature_list)] - if self._white_list: + if white_list: white_feature_list = [] - if os.path.isfile(self._white_list): - print(f'{self._white_list} is a file.') - with open(self._white_list) as f: + if os.path.isfile(white_list): + print(f'{white_list} is a file.') + with open(white_list) as f: white_feature_list += [feature.strip() for feature in f] - elif os.path.isdir(self._white_list): - print(f'{self._white_list} is a directory.') - for used_features in os.listdir(self._white_list): - filename = os.path.join(self._white_list, used_features) + elif os.path.isdir(white_list): + print(f'{white_list} is a directory.') + for used_features in os.listdir(white_list): + filename = os.path.join(white_list, used_features) with open(filename) as f: white_feature_list += [feature.strip() for feature in f] else: - print(f'{self._white_list} is neither a file nor a directory.') + print(f'{white_list} is neither a file nor a directory.') feature_map = feature_map[feature_map["name"].isin(white_feature_list)] @@ -137,7 +90,6 @@ def get_summary(self): for column in [ 'length', 'voc_size', - # 'composition_size' ]: if column in feature_map.columns: feature_map[column] = feature_map[column].astype(int) diff --git a/deepray/utils/data/input_meta.py b/deepray/utils/data/input_meta.py index a2a30865..ab20e450 100644 --- a/deepray/utils/data/input_meta.py +++ b/deepray/utils/data/input_meta.py @@ -10,8 +10,6 @@ from deepray.design_patterns import SingletonType -FLAGS = flags.FLAGS - class InputMeta(metaclass=SingletonType): diff --git a/deepray/utils/dllogger_class.py b/deepray/utils/dllogger_class.py deleted file mode 100644 index 2c851120..00000000 --- a/deepray/utils/dllogger_class.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -from dllogger import Logger, StdOutBackend, JSONStreamBackend, Verbosity - - -class dllogger_class(): - - def format_step(self, step): - if isinstance(step, str): - return step - elif isinstance(step, int): - return "Iteration: {} ".format(step) - elif len(step) > 0: - return "Iteration: {} ".format(step[0]) - else: - return "" - - def __init__(self, log_path="bert_dllog.json"): - self.logger = Logger( - [ - StdOutBackend(Verbosity.DEFAULT, step_format=self.format_step), - JSONStreamBackend(Verbosity.VERBOSE, log_path), - ] - ) - self.logger.metadata("mlm_loss", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}) - self.logger.metadata("nsp_loss", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}) - self.logger.metadata("avg_loss_step", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}) - self.logger.metadata("total_loss", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}) - self.logger.metadata("loss", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}) - self.logger.metadata("f1", {"unit": None, "format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}) - self.logger.metadata("precision", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}) - self.logger.metadata("recall", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}) - self.logger.metadata("mcc", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}) - self.logger.metadata("exact_match", {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}) - self.logger.metadata( - "throughput_train", - { - "unit": "sequences/s", - "format": ":.3f", - "GOAL": "MAXIMIZE", - "STAGE": "TRAIN" - }, - ) - self.logger.metadata( - "throughput_inf", - { - "unit": "sequences/s", - "format": ":.3f", - "GOAL": "MAXIMIZE", - "STAGE": "VAL" - }, - ) - self.logger.metadata( - "throughput_val", - { - "unit": "sequences/s", - "format": ":.3f", - "GOAL": "MAXIMIZE", - "STAGE": "VAL" - }, - ) diff --git a/deepray/utils/export/export.py b/deepray/utils/export/export.py index bfaf8228..d4ca9acc 100644 --- a/deepray/utils/export/export.py +++ b/deepray/utils/export/export.py @@ -26,15 +26,23 @@ import horovod.tensorflow as hvd import tensorflow as tf -from absl import logging, flags -from keras.engine import data_adapter +from absl import flags +from packaging.version import parse + +if parse(tf.__version__.replace("-tf", "+tf")) < parse("2.11"): + from keras.engine import data_adapter +elif parse(tf.__version__) > parse("2.16.0"): + from tf_keras.src.engine import data_adapter +else: + from keras.src.engine import data_adapter from tensorflow.python.compiler.tensorrt import trt_convert as trt from tensorflow.python.saved_model import signature_constants from tensorflow.python.saved_model import tag_constants +from deepray.utils import logging_util from deepray.utils.horovod_utils import is_main_process, get_world_size, get_rank -FLAGS = flags.FLAGS +logger = logging_util.get_logger() def build_tensor_serving_input_receiver_fn(shape, dtype=tf.float32, batch_size=1): @@ -64,33 +72,37 @@ def serving_input_receiver_fn(): def export_to_checkpoint(saver: Union[tf.train.Checkpoint, tf.train.CheckpointManager], checkpoint_number=None): - # TODO(@hejia): Fix export_to_checkpoint when use TFRA. 
- if FLAGS.use_dynamic_embedding: - return def helper(name, _saver): """Saves model to with provided checkpoint prefix.""" - latest_checkpoint_file = tf.train.latest_checkpoint(os.path.join(FLAGS.model_dir, 'ckpt_' + name)) + latest_checkpoint_file = tf.train.latest_checkpoint(os.path.join(flags.FLAGS.model_dir, 'ckpt_' + name)) match = re.search(r"(?<=ckpt-)\d+", latest_checkpoint_file) if latest_checkpoint_file else None latest_step_ckpt = int(match.group()) if match else -1 if latest_step_ckpt != checkpoint_number: save_path = _saver.save(checkpoint_number) - logging.info('Saved checkpoint to {}'.format(save_path)) + logger.info('Saved checkpoint to {}'.format(save_path)) - if is_main_process(): + def _save_fn(): if isinstance(saver, dict): for name, _saver in saver.items(): helper(name, _saver) else: helper(name="main", _saver=saver) + if flags.FLAGS.use_horovod and flags.FLAGS.use_dynamic_embedding: + _save_fn() + else: + _save_fn() + def export_to_savedmodel( model: Union[tf.keras.Model, Dict[Text, tf.keras.Model]], savedmodel_dir: Optional[Text] = None, checkpoint_dir: Optional[Union[Text, Dict[Text, Text]]] = None, - restore_model_using_load_weights: bool = False + restore_model_using_load_weights: bool = False, + include_optimizer: bool = False, + signatures=None ) -> Text: """Export keras model for serving which does not include the optimizer. @@ -112,7 +124,7 @@ def export_to_savedmodel( ValueError when model is not specified. """ - if FLAGS.use_dynamic_embedding and FLAGS.use_horovod: + if flags.FLAGS.use_dynamic_embedding and flags.FLAGS.use_horovod: try: rank_array = hvd.allgather_object(get_rank(), name='check_tfra_ranks') assert len(set(rank_array)) == get_world_size() @@ -120,8 +132,11 @@ def export_to_savedmodel( raise ValueError(f"Shouldn't place {inspect.stack()[0][3]} only in the main_process when use TFRA and Horovod.") def helper(name, _model: tf.keras.Model, _checkpoint_dir): - _savedmodel_dir = os.path.join(FLAGS.model_dir, 'export') if savedmodel_dir is None else savedmodel_dir - _savedmodel_dir = f"{_savedmodel_dir}_{name}" + _savedmodel_dir = os.path.join(flags.FLAGS.model_dir, 'export') if savedmodel_dir is None else savedmodel_dir + if get_world_size() > 1: + _savedmodel_dir = f"{_savedmodel_dir}_{name}_{get_rank()}" + else: + _savedmodel_dir = f"{_savedmodel_dir}_{name}" os.makedirs(_savedmodel_dir, exist_ok=True) if _checkpoint_dir: @@ -139,28 +154,59 @@ def helper(name, _model: tf.keras.Model, _checkpoint_dir): # Restores the model from latest checkpoint. 
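        # The restore block below locates the newest checkpoint under _checkpoint_dir,
        # requires one to exist, and restores it; assert_existing_objects_matched()
        # fails fast if that checkpoint does not cover the variables the model has
        # already built.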
latest_checkpoint_file = tf.train.latest_checkpoint(_checkpoint_dir) assert latest_checkpoint_file - logging.info('Checkpoint file %s found and restoring from ' - 'checkpoint', latest_checkpoint_file) + logger.info('Checkpoint file %s found and restoring from ' + 'checkpoint', latest_checkpoint_file) checkpoint.restore(latest_checkpoint_file).assert_existing_objects_matched() - options = tf.saved_model.SaveOptions(namespace_whitelist=['TFRA']) if FLAGS.use_dynamic_embedding else None - - if is_main_process(): - tf.saved_model.save(_model, _savedmodel_dir, options=options) + if flags.FLAGS.use_dynamic_embedding: + try: + from tensorflow_recommenders_addons import dynamic_embedding as de + de.keras.models.de_save_model( + _model, _savedmodel_dir, overwrite=True, include_optimizer=include_optimizer, signatures=signatures + ) + except: + # Compatible with TFRA version before commit 460b50847d459ebbf91b30ea0f9499fbc7ed9da0 + def _check_de_var_with_fs_saver(_var): + try: + from tensorflow_recommenders_addons import dynamic_embedding as de + # This function only serves FileSystemSaver. + return hasattr(_var, "params") and \ + hasattr(_var.params, "_created_in_class") and \ + _var.params._saveable_object_creator is not None and \ + isinstance(_var.params.kv_creator.saver, de.FileSystemSaver) + except: + return False + + de_dir = os.path.join(_savedmodel_dir, "variables", "TFRADynamicEmbedding") + options = tf.saved_model.SaveOptions(namespace_whitelist=['TFRA']) + if is_main_process(): + for var in _model.variables: + _is_dump = _check_de_var_with_fs_saver(var) + if _is_dump: + de_var = var.params + if hasattr(de_var, 'saveable'): + de_var.saveable._saver_config.save_path = de_dir + tf.saved_model.save(_model, export_dir=_savedmodel_dir, signatures=signatures, options=options) + else: + for var in _model.variables: + _is_dump = _check_de_var_with_fs_saver(var) + if _is_dump: + de_var = var.params + a2a_emb = de_var._created_in_class + # save other rank's embedding weights + var.params.save_to_file_system(dirpath=de_dir, proc_size=get_world_size(), proc_rank=get_rank()) + # save opt weights + if include_optimizer: + de_opt_vars = a2a_emb.optimizer_vars.as_list( + ) if hasattr(a2a_emb.optimizer_vars, "as_list") else a2a_emb.optimizer_vars + for de_opt_var in de_opt_vars: + de_opt_var.save_to_file_system(dirpath=de_dir, proc_size=get_world_size(), proc_rank=get_rank()) else: - de_dir = os.path.join(_savedmodel_dir, "variables", "TFRADynamicEmbedding") - for var in _model.variables: - if hasattr(var, "params"): - # save other rank's embedding weights - var.params.save_to_file_system(dirpath=de_dir, proc_size=get_world_size(), proc_rank=get_rank()) - # save opt weights - # opt_de_vars = var.params.optimizer_vars.as_list( - # ) if hasattr(var.params.optimizer_vars, "as_list") else var.params.optimizer_vars - # for opt_de_var in opt_de_vars: - # opt_de_var.save_to_file_system(dirpath=de_dir, proc_size=get_world_size(), proc_rank=get_rank()) + if is_main_process(): + tf.saved_model.save(_model, export_dir=_savedmodel_dir, signatures=signatures) if is_main_process(): - logging.info(f"save pb model to: {_savedmodel_dir}, without optimizer & traces") + logger.info(f"save pb model to: {_savedmodel_dir}, without optimizer & traces") return _savedmodel_dir @@ -170,7 +216,7 @@ def helper(name, _model: tf.keras.Model, _checkpoint_dir): _dir = helper(name, _model, _checkpoint_dir=checkpoint_dir[name] if checkpoint_dir else None) ans.append(_dir) prefix_path = longestCommonPrefix(ans) - logging.info(f"Export 
multiple models to {prefix_path}*") + logger.info(f"Export multiple models to {prefix_path}*") return prefix_path else: return helper(name="main", _model=model, _checkpoint_dir=checkpoint_dir) @@ -178,47 +224,52 @@ def helper(name, _model: tf.keras.Model, _checkpoint_dir): def optimize_for_inference( model: Union[tf.keras.Model, Dict[Text, tf.keras.Model]], - dataset: tf.data.Dataset, savedmodel_dir: Text, + dataset: tf.data.Dataset = None, + signatures=None, ) -> None: - x, y, z = data_adapter.unpack_x_y_sample_weight(next(iter(dataset))) - if isinstance(model, dict): - for name, _model in model.items(): - if "main" in name: - preds = _model(x) - logging.info(preds) - else: - preds = model(x) - logging.info(preds) + x = None + if dataset: + x, y, z = data_adapter.unpack_x_y_sample_weight(next(iter(dataset))) + if isinstance(model, dict): + for name, _model in model.items(): + if "main" in name: + preds = _model(x) + logger.debug(preds) + else: + preds = model(x) + logger.debug(preds) def helper(_model, path): tmp_path = tempfile.mkdtemp(dir='/tmp/') - export_to_savedmodel(_model, savedmodel_dir=tmp_path) + export_to_savedmodel(_model, savedmodel_dir=tmp_path, signatures=signatures) file = os.path.join(path, "saved_model.pb") if tf.io.gfile.exists(file): tf.io.gfile.remove(file) - logging.info(f"Replace optimized saved_modle.pb for {file}") + logger.info(f"Replace optimized saved_modle.pb for {file}") tf.io.gfile.copy(os.path.join(tmp_path + "_main", "saved_model.pb"), file, overwrite=True) else: raise FileNotFoundError(f"{file} does not exist.") if isinstance(model, dict): for name, _model in model.items(): - if "main" in name: - preds = _model(x) - logging.info(preds) + if dataset: + if "main" in name: + preds = _model(x) + logger.info(preds) src = savedmodel_dir + name helper(_model, src) else: - preds = model(x) - logging.info(preds) + if dataset: + preds = model(x) + logger.info(preds) helper(model, savedmodel_dir) class SavedModel: def __init__(self, model_dir, precision): - if FLAGS.use_dynamic_embedding: + if flags.FLAGS.use_dynamic_embedding: from tensorflow_recommenders_addons import dynamic_embedding as de de.enable_inference_mode() @@ -226,7 +277,7 @@ def __init__(self, model_dir, precision): self.graph_func = self.saved_model_loaded.signatures[signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] self.precision = tf.float16 if precision == "amp" else tf.float32 - if not FLAGS.run_eagerly: + if not flags.FLAGS.run_eagerly: self._infer_step = tf.function(self.infer_step) else: self._infer_step = self.infer_step @@ -244,7 +295,7 @@ class TFTRTModel: def export_model(self, model_dir, prec, tf_trt_model_dir=None): loaded_model = tf.saved_model.load(model_dir) signature = loaded_model.signatures['serving_default'] - logging.info(signature) + logger.info(signature) # input_shape = [1, 384] # dummy_input = tf.constant(tf.zeros(input_shape, dtype=tf.int32)) # x = [ @@ -262,13 +313,13 @@ def export_model(self, model_dir, prec, tf_trt_model_dir=None): converter.convert() tf_trt_model_dir = tf_trt_model_dir or f'/tmp/tf-trt_model_{prec}' converter.save(tf_trt_model_dir) - logging.info(f"TF-TRT model saved at {tf_trt_model_dir}") + logger.info(f"TF-TRT model saved at {tf_trt_model_dir}") def __init__(self, model_dir, precision): temp_tftrt_dir = f"/tmp/tf-trt_model_{precision}" self.export_model(model_dir, precision, temp_tftrt_dir) saved_model_loaded = tf.saved_model.load(temp_tftrt_dir, tags=[tag_constants.SERVING]) - logging.info(f"TF-TRT model loaded from {temp_tftrt_dir}") + 
logger.info(f"TF-TRT model loaded from {temp_tftrt_dir}") self.graph_func = saved_model_loaded.signatures[signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] self.precision = tf.float16 if precision == "amp" else tf.float32 diff --git a/deepray/utils/flags/_base.py b/deepray/utils/flags/_base.py index 400a3b5d..f4214d77 100644 --- a/deepray/utils/flags/_base.py +++ b/deepray/utils/flags/_base.py @@ -26,7 +26,7 @@ def define_base( num_train_examples=False, learning_rate=False, optimizer_type=False, - keras_use_ctl=False, + use_custom_training_loop=False, model_dir=False, clean=False, num_accumulation_steps=False, @@ -36,9 +36,7 @@ def define_base( num_gpus=False, init_checkpoint=False, hooks=False, - dllog_path=False, export_dir=False, - save_checkpoint_steps=False, run_eagerly=False ): """Register base flags. @@ -74,13 +72,13 @@ def define_base( if learning_rate: flags.DEFINE_float('learning_rate', 5e-5, 'The initial learning rate for Adam.') key_flags.append("learning_rate") - if keras_use_ctl: + if use_custom_training_loop: flags.DEFINE_bool( - name="keras_use_ctl", + name="use_custom_training_loop", default=True, help=flags_core.help_wrap("If True, we use a custom training loop for keras.") ) - key_flags.append("keras_use_ctl") + key_flags.append("use_custom_training_loop") if optimizer_type: flags.DEFINE_string("optimizer_type", "adam", "Optimizer used for training - LAMB or ADAM") key_flags.append("optimizer_type") @@ -94,16 +92,6 @@ def define_base( flags.DEFINE_list("init_weights", '', "Initial weights for the main model.") key_flags.append("init_weights") - if save_checkpoint_steps: - flags.DEFINE_integer( - 'save_checkpoint_steps', sys.maxsize, - 'save checkpoint for every n steps. Default value will not save checkpoint during training.' - ) - key_flags.append("save_checkpoint_steps") - if dllog_path: - flags.DEFINE_string('dllog_path', 'deepray_dllogger.json', 'filename where dllogger writes to') - key_flags.append("dllog_path") - if model_dir: flags.DEFINE_string( name="model_dir", diff --git a/deepray/utils/flags/_benchmark.py b/deepray/utils/flags/_benchmark.py index 7333e155..834c0301 100644 --- a/deepray/utils/flags/_benchmark.py +++ b/deepray/utils/flags/_benchmark.py @@ -26,11 +26,10 @@ def define_log_steps(): return [] -def define_benchmark(benchmark=False, benchmark_log_dir=True, bigquery_uploader=False): +def define_benchmark(bigquery_uploader=False): """Register benchmarking flags. Args: - benchmark_log_dir: Create a flag to specify location for benchmark logging. bigquery_uploader: Create flags for uploading results to BigQuery. 
Returns: @@ -38,11 +37,6 @@ def define_benchmark(benchmark=False, benchmark_log_dir=True, bigquery_uploader= """ key_flags = [] - - if benchmark: - flags.DEFINE_boolean('benchmark', False, 'Benchmark mode.') - key_flags.append("benchmark") - flags.DEFINE_enum( name="benchmark_logger_type", default="BaseBenchmarkLogger", @@ -68,11 +62,6 @@ def define_benchmark(benchmark=False, benchmark_log_dir=True, bigquery_uploader= define_log_steps() - if benchmark_log_dir: - flags.DEFINE_string( - name="benchmark_log_dir", default=None, help=help_wrap("The location of the benchmark logging.") - ) - if bigquery_uploader: flags.DEFINE_string( name="gcp_project", default=None, help=help_wrap("The GCP project name where the benchmark will be uploaded.") @@ -105,15 +94,4 @@ def define_benchmark(benchmark=False, benchmark_log_dir=True, bigquery_uploader= "information will be uploaded.") ) - @flags.multi_flags_validator( - ["benchmark_logger_type", "benchmark_log_dir"], - message="--benchmark_logger_type=BenchmarkFileLogger will require " - "--benchmark_log_dir being set" - ) - def _check_benchmark_log_dir(flags_dict): - benchmark_logger_type = flags_dict["benchmark_logger_type"] - if benchmark_logger_type == "BenchmarkFileLogger": - return flags_dict["benchmark_log_dir"] - return True - return key_flags diff --git a/deepray/utils/flags/_device.py b/deepray/utils/flags/_device.py index 1278ab15..1fc06419 100644 --- a/deepray/utils/flags/_device.py +++ b/deepray/utils/flags/_device.py @@ -57,7 +57,7 @@ def define_device(tpu=False, redis=False): if tpu: flags.DEFINE_string( - name="tpu", + name="tpu_address", default=None, help=help_wrap( "The Cloud TPU to use for training. This should be either the name " @@ -66,7 +66,7 @@ def define_device(tpu=False, redis=False): "CPU of the local instance instead. (Good for debugging.)" ) ) - key_flags.append("tpu") + key_flags.append("tpu_address") flags.DEFINE_string( name="tpu_zone", diff --git a/deepray/utils/flags/_distribution.py b/deepray/utils/flags/_distribution.py index 49d5f89b..bf7420c7 100644 --- a/deepray/utils/flags/_distribution.py +++ b/deepray/utils/flags/_distribution.py @@ -18,7 +18,7 @@ from deepray.utils.flags._conventions import help_wrap -def define_distribution(use_horovod=True, distribution_strategy=False, worker_hosts=True, task_index=True): +def define_distribution(use_horovod=True, distribution_strategy=False, worker_hosts=False, task_index=False): """Register distributed execution flags. Args: @@ -33,16 +33,16 @@ def define_distribution(use_horovod=True, distribution_strategy=False, worker_ho key_flags = [] if use_horovod: - flags.DEFINE_bool('use_horovod', False, 'Whether to use horovod.') + flags.DEFINE_bool("use_horovod", False, 'Whether to use horovod.') key_flags.append("use_horovod") if distribution_strategy: flags.DEFINE_string( name="distribution_strategy", - default="mirrored", + default="off", help=help_wrap( "The Distribution Strategy to use for training. " - "Accepted values are 'off', 'one_device', " + "Accepted values are 'off', 'horovod', 'one_device', " "'mirrored', 'parameter_server', 'collective', " "case insensitive. 'off' means not to use " "Distribution Strategy; 'default' means to choose " diff --git a/deepray/utils/flags/common_flags.py b/deepray/utils/flags/common_flags.py index f4b9e051..4939aeb1 100644 --- a/deepray/utils/flags/common_flags.py +++ b/deepray/utils/flags/common_flags.py @@ -13,9 +13,6 @@ # limitations under the License. 
# ============================================================================== """Defining common flags used across all BERT models/applications.""" -import datetime -import logging -import os import tensorflow as tf from absl import flags @@ -25,14 +22,13 @@ def define_common_flags(): """Define common flags for BERT tasks.""" - logging.info("flags base......................................") flags_core.define_base( train_data=True, num_train_examples=True, batch_size=True, learning_rate=True, optimizer_type=True, - keras_use_ctl=True, + use_custom_training_loop=True, num_accumulation_steps=True, init_checkpoint=True, num_gpus=True, @@ -43,8 +39,6 @@ def define_common_flags(): hooks=False, export_dir=False, run_eagerly=True, - dllog_path=True, - save_checkpoint_steps=True, ) flags.DEFINE_string( 'config_file', @@ -56,65 +50,19 @@ def define_common_flags(): '`--config_file` and `--params_override`, `config_file` will be used ' 'first, followed by params_override.' ) - flags.DEFINE_string('vocab_file', None, 'The vocabulary file that the BERT model was trained on.') - flags.DEFINE_bool( - "do_lower_case", True, "Whether to lower case the input text. Should be True for uncased " - "models and False for cased models." - ) flags.DEFINE_integer( - 'steps_per_summary', 200, 'Number of steps per graph-mode loop. Only training step ' + 'steps_per_execution', None, 'Number of steps per graph-mode loop. Only training step ' 'happens inside the loop. Callbacks will not be called ' 'inside.' ) flags.DEFINE_integer("stop_steps", -1, "steps when training stops") - flags.DEFINE_boolean( - 'scale_loss', False, 'Whether to divide the loss by number of replica inside the per-replica ' - 'loss function.' - ) - flags.DEFINE_string( - 'hub_module_url', None, 'TF-Hub path/url to Bert module. ' - 'If specified, init_checkpoint flag should not be used.' - ) flags.DEFINE_string( 'model_name', None, 'Specifies the name of the model. ' 'If "bert", will use canonical BERT; if "albert", will use ALBERT model.' ) - flags.DEFINE_enum( - 'mode', 'train_and_predict', - ['train_and_predict', 'train', 'predict', 'export_only', 'sm_predict', 'trt_predict'], - 'One of {"train_and_predict", "train", "predict", "export_only", "sm_predict", "trt_predict"}. ' - '`train_and_predict`: both train and predict to a json file. ' - '`train`: only trains the model. ' - 'trains the model and evaluates in the meantime. ' - '`predict`: predict answers from the squad json file. ' - '`export_only`: will take the latest checkpoint inside ' - 'model_dir and export a `SavedModel`.' - '`sm_predict`: will load SavedModel from savedmodel_dir and predict answers' - '`trt_predict`: will load SavedModel from savedmodel_dir, convert and predict answers with TF-TRT' - ) - flags.DEFINE_string( - 'input_meta_data_path', None, 'Path to file that contains meta data about input ' - 'to be used for training and evaluation.' - ) flags.DEFINE_bool("use_dynamic_embedding", False, "Whether use tfra.dynamic_embedding.") - flags.DEFINE_string('predict_file', None, 'Prediction data path with train tfrecords.') - flags.DEFINE_string( - "eval_script", None, "SQuAD evaluate.py file to compute f1 and exact_match E.g., evaluate-v1.1.py" - ) flags.DEFINE_integer( - 'n_best_size', 20, 'The total number of n-best predictions to generate in the ' - 'nbest_predictions.json output file.' - ) - flags.DEFINE_integer( - 'max_answer_length', 30, 'The maximum length of an answer that can be generated. 
This is needed ' - 'because the start and end predictions are not conditioned on one another.' - ) - flags.DEFINE_bool( - 'verbose_logging', False, 'If true, all of the warnings related to data processing will be printed. ' - 'A number of warnings are expected for a normal SQuAD evaluation.' - ) - flags.DEFINE_integer( - "random_seed", 12345, help=flags_core.help_wrap("This value will be used to seed both NumPy and TensorFlow.") + "random_seed", None, help=flags_core.help_wrap("This value will be used to seed both NumPy and TensorFlow.") ) # Adds flags for mixed precision training. flags_core.define_performance( @@ -124,7 +72,7 @@ def define_common_flags(): synthetic_data=False, max_train_steps=False, dtype=True, - dynamic_loss_scale=True, + dynamic_loss_scale=False, loss_scale=True, all_reduce_alg=False, num_packs=False, @@ -135,30 +83,21 @@ def define_common_flags(): flags_core.define_distribution(distribution_strategy=True) flags_core.define_data( dataset=True, - data_dir=True, - download_if_missing=True, - ) - flags_core.define_device(tpu=False, redis=True) - flags_core.define_benchmark(benchmark=True,) - - flags.DEFINE_string( - name="date", default=(datetime.datetime.now() - datetime.timedelta(days=1)).strftime("%Y-%m-%d"), help="" + data_dir=False, + download_if_missing=False, ) - flags.DEFINE_string(name="restore_date", default=None, help="") - flags.DEFINE_string(name="start_date", default=None, help="") - flags.DEFINE_string(name="end_date", default=None, help="") - flags.DEFINE_string(name="fine_tune", default=None, help="") - flags.DEFINE_string(name="warmup_path", default=None, help="") + flags_core.define_device(tpu=False, redis=False) + flags_core.define_benchmark() flags.DEFINE_float( "dropout_rate", default=-1, help="Dropout rate for all the classification MLPs (default: -1, disabled).", ) - flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.") flags.DEFINE_integer("prebatch", 1, "prebatch size for tfrecord") - flags.DEFINE_string("feature_map", os.path.join(os.getcwd(), "business/data/feature_map.csv"), "path to feature_map") + flags.DEFINE_string("feature_map", None, "path to feature_map") flags.DEFINE_string("black_list", None, "black list for feature_map") flags.DEFINE_string("white_list", None, "white list for feature_map") + flags.DEFINE_integer("ev_slot_num", 0, "ev_slot_num") def use_float16(): diff --git a/deepray/utils/flags/core.py b/deepray/utils/flags/core.py index b727fafb..4f89965e 100644 --- a/deepray/utils/flags/core.py +++ b/deepray/utils/flags/core.py @@ -159,7 +159,7 @@ def parse_flags(flags_obj): "epsilon": flags_obj.epsilon, "match_mlperf": flags_obj.ml_perf, # "epochs_between_evals": flags_obj.epochs_between_evals, - "keras_use_ctl": flags_obj.keras_use_ctl, + "use_custom_training_loop": flags_obj.use_custom_training_loop, "hr_threshold": flags_obj.hr_threshold, "stream_files": flags_obj.tpu is not None, "train_dataset_path": flags_obj.train_dataset_path, diff --git a/deepray/utils/horovod_utils.py b/deepray/utils/horovod_utils.py index 6da2d21a..5e62d6ad 100644 --- a/deepray/utils/horovod_utils.py +++ b/deepray/utils/horovod_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -11,10 +11,17 @@ # See the License for the specific language governing permissions and # limitations under the License. -import horovod.tensorflow.keras as hvd -from absl import logging, flags +# We don't want the whole process to quit because of the import failure when +# we don't use horovod to do communication. +try: + import horovod.tensorflow as hvd +except ImportError: + pass +from absl import flags -FLAGS = flags.FLAGS +from deepray.utils import logging_util + +logger = logging_util.get_logger() def get_rank(): @@ -32,4 +39,26 @@ def get_world_size(): def is_main_process(): - return not FLAGS.use_horovod or get_rank() == 0 + return not flags.FLAGS.use_horovod or get_rank() == 0 + + +def main_info(info): + if is_main_process(): + logger.info(info) + + +def main_warning(info): + if is_main_process(): + logger.warning(info) + + +def id_in_rank(): + return 0 + + +def num_gpu_per_rank(): + return 1 + + +def global_gpu_id(): + return get_rank() diff --git a/deepray/utils/keras_utils.py b/deepray/utils/keras_utils.py index 6bd2663a..47620db9 100644 --- a/deepray/utils/keras_utils.py +++ b/deepray/utils/keras_utils.py @@ -1,4 +1,4 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# Copyright 2024 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,154 +11,70 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# ============================================================================== -"""Utilities for tf.keras.""" +"""Helper functions for the Keras implementations of models.""" + +import multiprocessing +import os import tensorflow as tf +from absl import logging +from tensorflow.python import tf2 +from deepray.utils import logging_util -def is_tensor_or_variable(x): - return tf.is_tensor(x) or isinstance(x, tf.Variable) - - -class LossFunctionWrapper(tf.keras.losses.Loss): - """Wraps a loss function in the `Loss` class.""" - - def __init__(self, fn, reduction=tf.keras.losses.Reduction.AUTO, name=None, **kwargs): - """Initializes `LossFunctionWrapper` class. - - Args: - fn: The loss function to wrap, with signature `fn(y_true, y_pred, - **kwargs)`. - reduction: (Optional) Type of `tf.keras.losses.Reduction` to apply to - loss. Default value is `AUTO`. `AUTO` indicates that the reduction - option will be determined by the usage context. For almost all cases - this defaults to `SUM_OVER_BATCH_SIZE`. When used with - `tf.distribute.Strategy`, outside of built-in training loops such as - `tf.keras` `compile` and `fit`, using `AUTO` or `SUM_OVER_BATCH_SIZE` - will raise an error. Please see this custom training [tutorial]( - https://www.tensorflow.org/tutorials/distribute/custom_training) - for more details. - name: (Optional) name for the loss. - **kwargs: The keyword arguments that are passed on to `fn`. - """ - super().__init__(reduction=reduction, name=name) - self.fn = fn - self._fn_kwargs = kwargs - - def call(self, y_true, y_pred): - """Invokes the `LossFunctionWrapper` instance. - - Args: - y_true: Ground truth values. - y_pred: The predicted values. - - Returns: - Loss values per sample. 
- """ - return self.fn(y_true, y_pred, **self._fn_kwargs) - - def get_config(self): - config = {} - for k, v in iter(self._fn_kwargs.items()): - config[k] = tf.keras.backend.eval(v) if is_tensor_or_variable(v) else v - base_config = super().get_config() - return {**base_config, **config} - - -def normalize_data_format(value): - if value is None: - value = tf.keras.backend.image_data_format() - data_format = value.lower() - if data_format not in {"channels_first", "channels_last"}: - raise ValueError( - "The `data_format` argument must be one of " - '"channels_first", "channels_last". Received: ' + str(value) - ) - return data_format - - -def normalize_tuple(value, n, name): - """Transforms an integer or iterable of integers into an integer tuple. - - A copy of tensorflow.python.keras.util. - - Args: - value: The value to validate and convert. Could an int, or any iterable - of ints. - n: The size of the tuple to be returned. - name: The name of the argument being validated, e.g. "strides" or - "kernel_size". This is only used to format error messages. - - Returns: - A tuple of n integers. - - Raises: - ValueError: If something else than an int/long or iterable thereof was - passed. - """ - if isinstance(value, int): - return (value,) * n - else: - try: - value_tuple = tuple(value) - except TypeError: - raise TypeError("The `" + name + "` argument must be a tuple of " + str(n) + " integers. Received: " + str(value)) - if len(value_tuple) != n: - raise ValueError( - "The `" + name + "` argument must be a tuple of " + str(n) + " integers. Received: " + str(value) - ) - for single_value in value_tuple: - try: - int(single_value) - except (ValueError, TypeError): - raise ValueError( - "The `" + name + "` argument must be a tuple of " + str(n) + " integers. Received: " + str(value) + " " - "including element " + str(single_value) + " of type" + " " + str(type(single_value)) - ) - return value_tuple - - -def _hasattr(obj, attr_name): - # If possible, avoid retrieving the attribute as the object might run some - # lazy computation in it. - if attr_name in dir(obj): - return True - try: - getattr(obj, attr_name) - except AttributeError: - return False +logger = logging_util.get_logger() + + +def set_session_config(enable_eager=False, enable_xla=False): + """Sets the session config.""" + if is_v2_0(): + set_config_v2(enable_xla=enable_xla) else: - return True - - -def assert_like_rnncell(cell_name, cell): - """Raises a TypeError if cell is not like a - tf.keras.layers.AbstractRNNCell. - - Args: - cell_name: A string to give a meaningful error referencing to the name - of the function argument. - cell: The object which should behave like a - tf.keras.layers.AbstractRNNCell. - - Raises: - TypeError: A human-friendly exception. 
- """ - conditions = [ - _hasattr(cell, "output_size"), - _hasattr(cell, "state_size"), - _hasattr(cell, "get_initial_state"), - callable(cell), - ] - - errors = [ - "'output_size' property is missing", - "'state_size' property is missing", - "'get_initial_state' method is required", - "is not callable", - ] - - if not all(conditions): - errors = [error for error, cond in zip(errors, conditions) if not cond] - raise TypeError("The argument {!r} ({}) is not an RNNCell: {}.".format(cell_name, cell, ", ".join(errors))) + config = get_config_proto_v1(enable_xla=enable_xla) + if enable_eager: + tf.compat.v1.enable_eager_execution(config=config) + else: + sess = tf.Session(config=config) + tf.keras.backend.set_session(sess) + + +def get_config_proto_v1(enable_xla=False): + """Return config proto according to flag settings, or None to use default.""" + config = None + if enable_xla: + config = tf.compat.v1.ConfigProto() + config.graph_options.optimizer_options.global_jit_level = (tf.OptimizerOptions.ON_2) + return config + + +def set_config_v2(enable_xla=False): + """Config eager context according to flag values using TF 2.0 API.""" + if enable_xla: + tf.config.optimizer.set_jit(True) + logger.info("XLA activated") + + +def is_v2_0(): + """Returns true if using tf 2.0.""" + return tf2.enabled() + + +def set_gpu_thread_mode_and_count(gpu_thread_mode, datasets_num_private_threads, num_gpus, per_gpu_thread_count): + """Set GPU thread mode and count, and adjust dataset threads count.""" + cpu_count = multiprocessing.cpu_count() + logging.info('Logical CPU cores: %s', cpu_count) + + # Allocate private thread pool for each GPU to schedule and launch kernels + per_gpu_thread_count = per_gpu_thread_count or 2 + os.environ['TF_GPU_THREAD_MODE'] = gpu_thread_mode + os.environ['TF_GPU_THREAD_COUNT'] = str(per_gpu_thread_count) + logging.info('TF_GPU_THREAD_COUNT: %s', os.environ['TF_GPU_THREAD_COUNT']) + logging.info('TF_GPU_THREAD_MODE: %s', os.environ['TF_GPU_THREAD_MODE']) + + # Limit data preprocessing threadpool to CPU cores minus number of total GPU + # private threads and memory copy threads. + total_gpu_thread_count = per_gpu_thread_count * num_gpus + num_runtime_threads = num_gpus + if not datasets_num_private_threads: + datasets_num_private_threads = min(cpu_count - total_gpu_thread_count - num_runtime_threads, num_gpus * 8) + logging.info('Set datasets_num_private_threads to %s', datasets_num_private_threads) diff --git a/deepray/utils/logging_util.py b/deepray/utils/logging_util.py new file mode 100644 index 00000000..0de16c7d --- /dev/null +++ b/deepray/utils/logging_util.py @@ -0,0 +1,392 @@ +# coding=utf-8 +# Copyright 2020 Optuna, Hugging Face +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Logging utilities.""" + +import functools +import logging +import os +import sys +import threading +from logging import ( + CRITICAL, # NOQA + DEBUG, # NOQA + ERROR, # NOQA + FATAL, # NOQA + INFO, # NOQA + NOTSET, # NOQA + WARN, # NOQA + WARNING, # NOQA +) +from logging import captureWarnings as _captureWarnings +from typing import Optional + +from tqdm import auto as tqdm_lib + +_lock = threading.Lock() +_default_handler: Optional[logging.Handler] = None + +log_levels = { + "detail": logging.DEBUG, # will also print filename and line number + "debug": logging.DEBUG, + "info": logging.INFO, + "warning": logging.WARNING, + "error": logging.ERROR, + "critical": logging.CRITICAL, +} + +_default_log_level = logging.INFO + +_tqdm_active = True + + +def _get_default_logging_level(): + """ + If DEEPRAY_VERBOSITY env var is set to one of the valid choices return that as the new default level. If it is + not - fall back to `_default_log_level` + """ + env_level_str = os.getenv("DEEPRAY_VERBOSITY", None) + if env_level_str: + if env_level_str in log_levels: + return log_levels[env_level_str] + else: + logging.getLogger().warning( + f"Unknown option DEEPRAY_VERBOSITY={env_level_str}, " + f"has to be one of: { ', '.join(log_levels.keys()) }" + ) + return _default_log_level + + +def _get_library_name() -> str: + return __name__.split(".")[0] + + +def _get_library_root_logger() -> logging.Logger: + return logging.getLogger(_get_library_name()) + + +def _configure_library_root_logger() -> None: + global _default_handler + + with _lock: + if _default_handler: + # This library has already configured the library root logger. + return + _default_handler = logging.StreamHandler() # Set sys.stderr as stream. + # set defaults based on https://github.com/pyinstaller/pyinstaller/issues/7334#issuecomment-1357447176 + if sys.stderr is None: + sys.stderr = open(os.devnull, "w") + + _default_handler.flush = sys.stderr.flush + + # Apply our default configuration to the library root logger. + library_root_logger = _get_library_root_logger() + library_root_logger.addHandler(_default_handler) + library_root_logger.setLevel(_get_default_logging_level()) + # if logging level is debug, we add pathname and lineno to formatter for easy debugging + if os.getenv("DEEPRAY_VERBOSITY", None) == "detail": + formatter = logging.Formatter("%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s") + _default_handler.setFormatter(formatter) + + library_root_logger.propagate = False + + +def _reset_library_root_logger() -> None: + global _default_handler + + with _lock: + if not _default_handler: + return + + library_root_logger = _get_library_root_logger() + library_root_logger.removeHandler(_default_handler) + library_root_logger.setLevel(logging.NOTSET) + _default_handler = None + + +def get_log_levels_dict(): + return log_levels + + +def captureWarnings(capture): + """ + Calls the `captureWarnings` method from the logging library to enable management of the warnings emitted by the + `warnings` library. + + Read more about this method here: + https://docs.python.org/3/library/logging.html#integration-with-the-warnings-module + + All warnings will be logged through the `py.warnings` logger. + + Careful: this method also adds a handler to this logger if it does not already have one, and updates the logging + level of that logger to the library's root logger. 
+ """ + logger = get_logger("py.warnings") + + if not logger.handlers: + logger.addHandler(_default_handler) + + logger.setLevel(_get_library_root_logger().level) + + _captureWarnings(capture) + + +def get_logger(name: Optional[str] = None) -> logging.Logger: + """ + Return a logger with the specified name. + + This function is not supposed to be directly accessed unless you are writing a custom transformers module. + """ + + if name is None: + name = _get_library_name() + + _configure_library_root_logger() + return logging.getLogger(name) + + +def get_verbosity() -> int: + """ + Return the current level for the 🤗 Transformers's root logger as an int. + + Returns: + `int`: The logging level. + + + + 🤗 Transformers has following logging levels: + + - 50: `transformers.logging.CRITICAL` or `transformers.logging.FATAL` + - 40: `transformers.logging.ERROR` + - 30: `transformers.logging.WARNING` or `transformers.logging.WARN` + - 20: `transformers.logging.INFO` + - 10: `transformers.logging.DEBUG` + + """ + + _configure_library_root_logger() + return _get_library_root_logger().getEffectiveLevel() + + +def set_verbosity(verbosity: int) -> None: + """ + Set the verbosity level for the 🤗 Transformers's root logger. + + Args: + verbosity (`int`): + Logging level, e.g., one of: + + - `transformers.logging.CRITICAL` or `transformers.logging.FATAL` + - `transformers.logging.ERROR` + - `transformers.logging.WARNING` or `transformers.logging.WARN` + - `transformers.logging.INFO` + - `transformers.logging.DEBUG` + """ + + _configure_library_root_logger() + _get_library_root_logger().setLevel(verbosity) + + +def set_verbosity_info(): + """Set the verbosity to the `INFO` level.""" + return set_verbosity(INFO) + + +def set_verbosity_warning(): + """Set the verbosity to the `WARNING` level.""" + return set_verbosity(WARNING) + + +def set_verbosity_debug(): + """Set the verbosity to the `DEBUG` level.""" + return set_verbosity(DEBUG) + + +def set_verbosity_error(): + """Set the verbosity to the `ERROR` level.""" + return set_verbosity(ERROR) + + +def disable_default_handler() -> None: + """Disable the default handler of the HuggingFace Transformers's root logger.""" + + _configure_library_root_logger() + + assert _default_handler is not None + _get_library_root_logger().removeHandler(_default_handler) + + +def enable_default_handler() -> None: + """Enable the default handler of the HuggingFace Transformers's root logger.""" + + _configure_library_root_logger() + + assert _default_handler is not None + _get_library_root_logger().addHandler(_default_handler) + + +def add_handler(handler: logging.Handler) -> None: + """adds a handler to the HuggingFace Transformers's root logger.""" + + _configure_library_root_logger() + + assert handler is not None + _get_library_root_logger().addHandler(handler) + + +def remove_handler(handler: logging.Handler) -> None: + """removes given handler from the HuggingFace Transformers's root logger.""" + + _configure_library_root_logger() + + assert handler is not None and handler not in _get_library_root_logger().handlers + _get_library_root_logger().removeHandler(handler) + + +def disable_propagation() -> None: + """ + Disable propagation of the library log outputs. Note that log propagation is disabled by default. + """ + + _configure_library_root_logger() + _get_library_root_logger().propagate = False + + +def enable_propagation() -> None: + """ + Enable propagation of the library log outputs. 
+  Please disable Deepray's default handler to prevent double logging if the root logger has been configured.
+  """
+
+  _configure_library_root_logger()
+  _get_library_root_logger().propagate = True
+
+
+def enable_explicit_format() -> None:
+  """
+  Enable explicit formatting for every Deepray logger. The explicit formatter is as follows:
+  ```
+  [LEVELNAME|FILENAME|LINE NUMBER] TIME >> MESSAGE
+  ```
+  All handlers currently bound to the root logger are affected by this method.
+  """
+  handlers = _get_library_root_logger().handlers
+
+  for handler in handlers:
+    formatter = logging.Formatter("[%(levelname)s|%(filename)s:%(lineno)s] %(asctime)s >> %(message)s")
+    handler.setFormatter(formatter)
+
+
+def reset_format() -> None:
+  """
+  Resets the formatting for Deepray's loggers.
+
+  All handlers currently bound to the root logger are affected by this method.
+  """
+  handlers = _get_library_root_logger().handlers
+
+  for handler in handlers:
+    handler.setFormatter(None)
+
+
+def warning_advice(self, *args, **kwargs):
+  """
+  This method is identical to `logger.warning()`, but if the env var TRANSFORMERS_NO_ADVISORY_WARNINGS=1 is set, the
+  warning will not be printed.
+  """
+  no_advisory_warnings = os.getenv("TRANSFORMERS_NO_ADVISORY_WARNINGS", False)
+  if no_advisory_warnings:
+    return
+  self.warning(*args, **kwargs)
+
+
+logging.Logger.warning_advice = warning_advice
+
+
+@functools.lru_cache(None)
+def warning_once(self, *args, **kwargs):
+  """
+  This method is identical to `logger.warning()`, but will emit the warning with the same message only once.
+
+  Note: The cache is keyed on the function arguments, so two different callers using the same arguments will hit the
+  cache. The assumption here is that all warning messages are unique across the code. If they aren't, then we need to
+  switch to another type of cache that includes the caller frame information in the hashing function.
+ """ + self.warning(*args, **kwargs) + + +logging.Logger.warning_once = warning_once + + +class EmptyTqdm: + """Dummy tqdm which doesn't do anything.""" + + def __init__(self, *args, **kwargs): # pylint: disable=unused-argument + self._iterator = args[0] if args else None + + def __iter__(self): + return iter(self._iterator) + + def __getattr__(self, _): + """Return empty function.""" + + def empty_fn(*args, **kwargs): # pylint: disable=unused-argument + return + + return empty_fn + + def __enter__(self): + return self + + def __exit__(self, type_, value, traceback): + return + + +class _tqdm_cls: + + def __call__(self, *args, **kwargs): + if _tqdm_active: + return tqdm_lib.tqdm(*args, **kwargs) + else: + return EmptyTqdm(*args, **kwargs) + + def set_lock(self, *args, **kwargs): + self._lock = None + if _tqdm_active: + return tqdm_lib.tqdm.set_lock(*args, **kwargs) + + def get_lock(self): + if _tqdm_active: + return tqdm_lib.tqdm.get_lock() + + +tqdm = _tqdm_cls() + + +def is_progress_bar_enabled() -> bool: + """Return a boolean indicating whether tqdm progress bars are enabled.""" + global _tqdm_active + return bool(_tqdm_active) + + +def enable_progress_bar(): + """Enable tqdm progress bar.""" + global _tqdm_active + _tqdm_active = True + + +def disable_progress_bar(): + """Disable tqdm progress bar.""" + global _tqdm_active + _tqdm_active = False diff --git a/deepray/utils/logs/hooks.py b/deepray/utils/logs/hooks.py deleted file mode 100644 index 065c2fef..00000000 --- a/deepray/utils/logs/hooks.py +++ /dev/null @@ -1,113 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Hook that counts examples per second every N steps or seconds.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf # pylint: disable=g-bad-import-order - -from official.utils.logs import logger - - -class ExamplesPerSecondHook(tf.estimator.SessionRunHook): - """Hook to print out examples per second. - - Total time is tracked and then divided by the total number of steps - to get the average step time and then batch_size is used to determine - the running average of examples per second. The examples per second for the - most recent interval is also logged. - """ - - def __init__(self, batch_size, every_n_steps=None, every_n_secs=None, warm_steps=0, metric_logger=None): - """Initializer for ExamplesPerSecondHook. - - Args: - batch_size: Total batch size across all workers used to calculate - examples/second from global time. - every_n_steps: Log stats every n steps. - every_n_secs: Log stats every n seconds. Exactly one of the - `every_n_steps` or `every_n_secs` should be set. - warm_steps: The number of steps to be skipped before logging and running - average calculation. 
warm_steps steps refers to global steps across all - workers, not on each worker - metric_logger: instance of `BenchmarkLogger`, the benchmark logger that - hook should use to write the log. If None, BaseBenchmarkLogger will - be used. - - Raises: - ValueError: if neither `every_n_steps` or `every_n_secs` is set, or - both are set. - """ - - if (every_n_steps is None) == (every_n_secs is None): - raise ValueError("exactly one of every_n_steps" - " and every_n_secs should be provided.") - - self._logger = metric_logger or logger.BaseBenchmarkLogger() - - self._timer = tf.estimator.SecondOrStepTimer(every_steps=every_n_steps, every_secs=every_n_secs) - - self._step_train_time = 0 - self._total_steps = 0 - self._batch_size = batch_size - self._warm_steps = warm_steps - # List of examples per second logged every_n_steps. - self.current_examples_per_sec_list = [] - - def begin(self): - """Called once before using the session to check global step.""" - self._global_step_tensor = tf.compat.v1.train.get_global_step() - if self._global_step_tensor is None: - raise RuntimeError("Global step should be created to use StepCounterHook.") - - def before_run(self, run_context): # pylint: disable=unused-argument - """Called before each call to run(). - - Args: - run_context: A SessionRunContext object. - - Returns: - A SessionRunArgs object or None if never triggered. - """ - return tf.estimator.SessionRunArgs(self._global_step_tensor) - - def after_run(self, run_context, run_values): # pylint: disable=unused-argument - """Called after each call to run(). - - Args: - run_context: A SessionRunContext object. - run_values: A SessionRunValues object. - """ - global_step = run_values.results - - if self._timer.should_trigger_for_step(global_step) and global_step > self._warm_steps: - elapsed_time, elapsed_steps = self._timer.update_last_triggered_step(global_step) - if elapsed_time is not None: - self._step_train_time += elapsed_time - self._total_steps += elapsed_steps - - # average examples per second is based on the total (accumulative) - # training steps and training time so far - average_examples_per_sec = self._batch_size * (self._total_steps / self._step_train_time) - # current examples per second is based on the elapsed training steps - # and training time per batch - current_examples_per_sec = self._batch_size * (elapsed_steps / elapsed_time) - # Logs entries to be read from hook during or after run. - self.current_examples_per_sec_list.append(current_examples_per_sec) - self._logger.log_metric("average_examples_per_sec", average_examples_per_sec, global_step=global_step) - - self._logger.log_metric("current_examples_per_sec", current_examples_per_sec, global_step=global_step) diff --git a/deepray/utils/logs/hooks_test.py b/deepray/utils/logs/hooks_test.py deleted file mode 100644 index cb3c18ad..00000000 --- a/deepray/utils/logs/hooks_test.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests for hooks.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import time - -import tensorflow as tf # pylint: disable=g-bad-import-order - -from official.utils.logs import hooks -from official.utils.testing import mock_lib - -logging.set_verbosity(logging.DEBUG) - - -class ExamplesPerSecondHookTest(tf.test.TestCase): - """Tests for the ExamplesPerSecondHook. - - In the test, we explicitly run global_step tensor after train_op in order to - keep the global_step value and the train_op (which increase the glboal_step - by 1) consistent. This is to correct the discrepancies in reported global_step - value when running on GPUs. - """ - - def setUp(self): - """Mock out logging calls to verify if correct info is being monitored.""" - self._logger = mock_lib.MockBenchmarkLogger() - - self.graph = tf.Graph() - with self.graph.as_default(): - tf.compat.v1.train.create_global_step() - self.train_op = tf.compat.v1.assign_add(tf.compat.v1.train.get_global_step(), 1) - self.global_step = tf.compat.v1.train.get_global_step() - - def test_raise_in_both_secs_and_steps(self): - with self.assertRaises(ValueError): - hooks.ExamplesPerSecondHook(batch_size=256, every_n_steps=10, every_n_secs=20, metric_logger=self._logger) - - def test_raise_in_none_secs_and_steps(self): - with self.assertRaises(ValueError): - hooks.ExamplesPerSecondHook(batch_size=256, every_n_steps=None, every_n_secs=None, metric_logger=self._logger) - - def _validate_log_every_n_steps(self, every_n_steps, warm_steps): - hook = hooks.ExamplesPerSecondHook( - batch_size=256, every_n_steps=every_n_steps, warm_steps=warm_steps, metric_logger=self._logger - ) - - with tf.compat.v1.train.MonitoredSession(tf.compat.v1.train.ChiefSessionCreator(), [hook]) as mon_sess: - for _ in range(every_n_steps): - # Explicitly run global_step after train_op to get the accurate - # global_step value - mon_sess.run(self.train_op) - mon_sess.run(self.global_step) - # Nothing should be in the list yet - self.assertFalse(self._logger.logged_metric) - - mon_sess.run(self.train_op) - global_step_val = mon_sess.run(self.global_step) - - if global_step_val > warm_steps: - self._assert_metrics() - else: - # Nothing should be in the list yet - self.assertFalse(self._logger.logged_metric) - - # Add additional run to verify proper reset when called multiple times. - prev_log_len = len(self._logger.logged_metric) - mon_sess.run(self.train_op) - global_step_val = mon_sess.run(self.global_step) - - if every_n_steps == 1 and global_step_val > warm_steps: - # Each time, we log two additional metrics. Did exactly 2 get added? - self.assertEqual(len(self._logger.logged_metric), prev_log_len + 2) - else: - # No change in the size of the metric list. 
- self.assertEqual(len(self._logger.logged_metric), prev_log_len) - - def test_examples_per_sec_every_1_steps(self): - with self.graph.as_default(): - self._validate_log_every_n_steps(1, 0) - - def test_examples_per_sec_every_5_steps(self): - with self.graph.as_default(): - self._validate_log_every_n_steps(5, 0) - - def test_examples_per_sec_every_1_steps_with_warm_steps(self): - with self.graph.as_default(): - self._validate_log_every_n_steps(1, 10) - - def test_examples_per_sec_every_5_steps_with_warm_steps(self): - with self.graph.as_default(): - self._validate_log_every_n_steps(5, 10) - - def _validate_log_every_n_secs(self, every_n_secs): - hook = hooks.ExamplesPerSecondHook( - batch_size=256, every_n_steps=None, every_n_secs=every_n_secs, metric_logger=self._logger - ) - - with tf.compat.v1.train.MonitoredSession(tf.compat.v1.train.ChiefSessionCreator(), [hook]) as mon_sess: - # Explicitly run global_step after train_op to get the accurate - # global_step value - mon_sess.run(self.train_op) - mon_sess.run(self.global_step) - # Nothing should be in the list yet - self.assertFalse(self._logger.logged_metric) - time.sleep(every_n_secs) - - mon_sess.run(self.train_op) - mon_sess.run(self.global_step) - self._assert_metrics() - - def test_examples_per_sec_every_1_secs(self): - with self.graph.as_default(): - self._validate_log_every_n_secs(1) - - def test_examples_per_sec_every_5_secs(self): - with self.graph.as_default(): - self._validate_log_every_n_secs(5) - - def _assert_metrics(self): - metrics = self._logger.logged_metric - self.assertEqual(metrics[-2]["name"], "average_examples_per_sec") - self.assertEqual(metrics[-1]["name"], "current_examples_per_sec") - - -if __name__ == "__main__": - tf.test.main() diff --git a/deepray/utils/logs/logger.py b/deepray/utils/logs/logger.py index 3863149a..554b8de7 100644 --- a/deepray/utils/logs/logger.py +++ b/deepray/utils/logs/logger.py @@ -45,8 +45,6 @@ RUN_STATUS_FAILURE = "failure" RUN_STATUS_RUNNING = "running" -FLAGS = flags.FLAGS - # Don't use it directly. Use get_benchmark_logger to access a logger. _benchmark_logger = None _logger_lock = threading.Lock() diff --git a/deepray/utils/logs/metric_hook.py b/deepray/utils/logs/metric_hook.py deleted file mode 100644 index 73d3b6bc..00000000 --- a/deepray/utils/logs/metric_hook.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Session hook for logging benchmark metric.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf # pylint: disable=g-bad-import-order - - -class LoggingMetricHook(tf.estimator.LoggingTensorHook): - """Hook to log benchmark metric information. - - This hook is very similar as tf.train.LoggingTensorHook, which logs given - tensors every N local steps, every N seconds, or at the end. 
The metric - information will be logged to given log_dir or via metric_logger in JSON - format, which can be consumed by data analysis pipeline later. - - Note that if `at_end` is True, `tensors` should not include any tensor - whose evaluation produces a side effect such as consuming additional inputs. - """ - - def __init__(self, tensors, metric_logger=None, every_n_iter=None, every_n_secs=None, at_end=False): - """Initializer for LoggingMetricHook. - - Args: - tensors: `dict` that maps string-valued tags to tensors/tensor names, - or `iterable` of tensors/tensor names. - metric_logger: instance of `BenchmarkLogger`, the benchmark logger that - hook should use to write the log. - every_n_iter: `int`, print the values of `tensors` once every N local - steps taken on the current worker. - every_n_secs: `int` or `float`, print the values of `tensors` once every N - seconds. Exactly one of `every_n_iter` and `every_n_secs` should be - provided. - at_end: `bool` specifying whether to print the values of `tensors` at the - end of the run. - - Raises: - ValueError: - 1. `every_n_iter` is non-positive, or - 2. Exactly one of every_n_iter and every_n_secs should be provided. - 3. Exactly one of log_dir and metric_logger should be provided. - """ - super(LoggingMetricHook, - self).__init__(tensors=tensors, every_n_iter=every_n_iter, every_n_secs=every_n_secs, at_end=at_end) - - if metric_logger is None: - raise ValueError("metric_logger should be provided.") - self._logger = metric_logger - - def begin(self): - super(LoggingMetricHook, self).begin() - self._global_step_tensor = tf.compat.v1.train.get_global_step() - if self._global_step_tensor is None: - raise RuntimeError("Global step should be created to use LoggingMetricHook.") - if self._global_step_tensor.name not in self._current_tensors: - self._current_tensors[self._global_step_tensor.name] = (self._global_step_tensor) - - def after_run(self, unused_run_context, run_values): - # should_trigger is a internal state that populated at before_run, and it is - # using self_timer to determine whether it should trigger. - if self._should_trigger: - self._log_metric(run_values.results) - - self._iter_count += 1 - - def end(self, session): - if self._log_at_end: - values = session.run(self._current_tensors) - self._log_metric(values) - - def _log_metric(self, tensor_values): - self._timer.update_last_triggered_step(self._iter_count) - global_step = tensor_values[self._global_step_tensor.name] - # self._tag_order is populated during the init of LoggingTensorHook - for tag in self._tag_order: - self._logger.log_metric(tag, tensor_values[tag], global_step=global_step) diff --git a/deepray/utils/logs/metric_hook_test.py b/deepray/utils/logs/metric_hook_test.py deleted file mode 100644 index d8c82c53..00000000 --- a/deepray/utils/logs/metric_hook_test.py +++ /dev/null @@ -1,208 +0,0 @@ -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests for metric_hook.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tempfile -import time - -import tensorflow as tf # pylint: disable=g-bad-import-order -from tensorflow.python.training import monitored_session # pylint: disable=g-bad-import-order - -from official.utils.logs import metric_hook -from official.utils.testing import mock_lib - - -class LoggingMetricHookTest(tf.test.TestCase): - """Tests for LoggingMetricHook.""" - - def setUp(self): - super(LoggingMetricHookTest, self).setUp() - - self._log_dir = tempfile.mkdtemp(dir=self.get_temp_dir()) - self._logger = mock_lib.MockBenchmarkLogger() - - def tearDown(self): - super(LoggingMetricHookTest, self).tearDown() - tf.io.gfile.rmtree(self.get_temp_dir()) - - def test_illegal_args(self): - with self.assertRaisesRegexp(ValueError, "nvalid every_n_iter"): - metric_hook.LoggingMetricHook(tensors=["t"], every_n_iter=0) - with self.assertRaisesRegexp(ValueError, "nvalid every_n_iter"): - metric_hook.LoggingMetricHook(tensors=["t"], every_n_iter=-10) - with self.assertRaisesRegexp(ValueError, "xactly one of"): - metric_hook.LoggingMetricHook(tensors=["t"], every_n_iter=5, every_n_secs=5) - with self.assertRaisesRegexp(ValueError, "xactly one of"): - metric_hook.LoggingMetricHook(tensors=["t"]) - with self.assertRaisesRegexp(ValueError, "metric_logger"): - metric_hook.LoggingMetricHook(tensors=["t"], every_n_iter=5) - - def test_print_at_end_only(self): - with tf.Graph().as_default(), tf.compat.v1.Session() as sess: - tf.compat.v1.train.get_or_create_global_step() - t = tf.constant(42.0, name="foo") - train_op = tf.constant(3) - hook = metric_hook.LoggingMetricHook(tensors=[t.name], at_end=True, metric_logger=self._logger) - hook.begin() - mon_sess = monitored_session._HookedSession(sess, [hook]) # pylint: disable=protected-access - sess.run(tf.compat.v1.global_variables_initializer()) - - for _ in range(3): - mon_sess.run(train_op) - self.assertEqual(self._logger.logged_metric, []) - - hook.end(sess) - self.assertEqual(len(self._logger.logged_metric), 1) - metric = self._logger.logged_metric[0] - self.assertRegexpMatches(metric["name"], "foo") - self.assertEqual(metric["value"], 42.0) - self.assertEqual(metric["unit"], None) - self.assertEqual(metric["global_step"], 0) - - def test_global_step_not_found(self): - with tf.Graph().as_default(): - t = tf.constant(42.0, name="foo") - hook = metric_hook.LoggingMetricHook(tensors=[t.name], at_end=True, metric_logger=self._logger) - - with self.assertRaisesRegexp(RuntimeError, "should be created to use LoggingMetricHook."): - hook.begin() - - def test_log_tensors(self): - with tf.Graph().as_default(), tf.compat.v1.Session() as sess: - tf.compat.v1.train.get_or_create_global_step() - t1 = tf.constant(42.0, name="foo") - t2 = tf.constant(43.0, name="bar") - train_op = tf.constant(3) - hook = metric_hook.LoggingMetricHook(tensors=[t1, t2], at_end=True, metric_logger=self._logger) - hook.begin() - mon_sess = monitored_session._HookedSession(sess, [hook]) # pylint: disable=protected-access - sess.run(tf.compat.v1.global_variables_initializer()) - - for _ in range(3): - mon_sess.run(train_op) - self.assertEqual(self._logger.logged_metric, []) - - hook.end(sess) - self.assertEqual(len(self._logger.logged_metric), 2) - metric1 = self._logger.logged_metric[0] - self.assertRegexpMatches(str(metric1["name"]), "foo") - 
self.assertEqual(metric1["value"], 42.0) - self.assertEqual(metric1["unit"], None) - self.assertEqual(metric1["global_step"], 0) - - metric2 = self._logger.logged_metric[1] - self.assertRegexpMatches(str(metric2["name"]), "bar") - self.assertEqual(metric2["value"], 43.0) - self.assertEqual(metric2["unit"], None) - self.assertEqual(metric2["global_step"], 0) - - def _validate_print_every_n_steps(self, sess, at_end): - t = tf.constant(42.0, name="foo") - - train_op = tf.constant(3) - hook = metric_hook.LoggingMetricHook(tensors=[t.name], every_n_iter=10, at_end=at_end, metric_logger=self._logger) - hook.begin() - mon_sess = monitored_session._HookedSession(sess, [hook]) # pylint: disable=protected-access - sess.run(tf.compat.v1.global_variables_initializer()) - mon_sess.run(train_op) - self.assertRegexpMatches(str(self._logger.logged_metric), t.name) - for _ in range(3): - self._logger.logged_metric = [] - for _ in range(9): - mon_sess.run(train_op) - # assertNotRegexpMatches is not supported by python 3.1 and later - self.assertEqual(str(self._logger.logged_metric).find(t.name), -1) - mon_sess.run(train_op) - self.assertRegexpMatches(str(self._logger.logged_metric), t.name) - - # Add additional run to verify proper reset when called multiple times. - self._logger.logged_metric = [] - mon_sess.run(train_op) - # assertNotRegexpMatches is not supported by python 3.1 and later - self.assertEqual(str(self._logger.logged_metric).find(t.name), -1) - - self._logger.logged_metric = [] - hook.end(sess) - if at_end: - self.assertRegexpMatches(str(self._logger.logged_metric), t.name) - else: - # assertNotRegexpMatches is not supported by python 3.1 and later - self.assertEqual(str(self._logger.logged_metric).find(t.name), -1) - - def test_print_every_n_steps(self): - with tf.Graph().as_default(), tf.compat.v1.Session() as sess: - tf.compat.v1.train.get_or_create_global_step() - self._validate_print_every_n_steps(sess, at_end=False) - # Verify proper reset. - self._validate_print_every_n_steps(sess, at_end=False) - - def test_print_every_n_steps_and_end(self): - with tf.Graph().as_default(), tf.compat.v1.Session() as sess: - tf.compat.v1.train.get_or_create_global_step() - self._validate_print_every_n_steps(sess, at_end=True) - # Verify proper reset. 
- self._validate_print_every_n_steps(sess, at_end=True) - - def _validate_print_every_n_secs(self, sess, at_end): - t = tf.constant(42.0, name="foo") - train_op = tf.constant(3) - - hook = metric_hook.LoggingMetricHook(tensors=[t.name], every_n_secs=1.0, at_end=at_end, metric_logger=self._logger) - hook.begin() - mon_sess = monitored_session._HookedSession(sess, [hook]) # pylint: disable=protected-access - sess.run(tf.compat.v1.global_variables_initializer()) - - mon_sess.run(train_op) - self.assertRegexpMatches(str(self._logger.logged_metric), t.name) - - # assertNotRegexpMatches is not supported by python 3.1 and later - self._logger.logged_metric = [] - mon_sess.run(train_op) - self.assertEqual(str(self._logger.logged_metric).find(t.name), -1) - time.sleep(1.0) - - self._logger.logged_metric = [] - mon_sess.run(train_op) - self.assertRegexpMatches(str(self._logger.logged_metric), t.name) - - self._logger.logged_metric = [] - hook.end(sess) - if at_end: - self.assertRegexpMatches(str(self._logger.logged_metric), t.name) - else: - # assertNotRegexpMatches is not supported by python 3.1 and later - self.assertEqual(str(self._logger.logged_metric).find(t.name), -1) - - def test_print_every_n_secs(self): - with tf.Graph().as_default(), tf.compat.v1.Session() as sess: - tf.compat.v1.train.get_or_create_global_step() - self._validate_print_every_n_secs(sess, at_end=False) - # Verify proper reset. - self._validate_print_every_n_secs(sess, at_end=False) - - def test_print_every_n_secs_and_end(self): - with tf.Graph().as_default(), tf.compat.v1.Session() as sess: - tf.compat.v1.train.get_or_create_global_step() - self._validate_print_every_n_secs(sess, at_end=True) - # Verify proper reset. - self._validate_print_every_n_secs(sess, at_end=True) - - -if __name__ == "__main__": - tf.test.main() diff --git a/deepray/utils/logs/mlperf_helper.py b/deepray/utils/logs/mlperf_helper.py index a2340b70..c2553148 100644 --- a/deepray/utils/logs/mlperf_helper.py +++ b/deepray/utils/logs/mlperf_helper.py @@ -193,6 +193,5 @@ def clear_system_caches(): if __name__ == "__main__": - logging.set_verbosity(logging.INFO) with LOGGER(True): ncf_print(key=TAGS.RUN_START) diff --git a/deepray/utils/logs/summary_manager.py b/deepray/utils/logs/summary_manager.py index 7f6ef677..7af94e6c 100644 --- a/deepray/utils/logs/summary_manager.py +++ b/deepray/utils/logs/summary_manager.py @@ -22,8 +22,6 @@ import tensorflow as tf import horovod.tensorflow as hvd -FLAGS = flags.FLAGS - _MIN_SUMMARY_STEPS = 10 @@ -52,7 +50,7 @@ def __init__(self, summary_dir, global_step=None): self.summary_writers['train'], self.summary_writers['evel'] = None, None else: self.summary_writers['evel'] = tf.summary.create_file_writer(os.path.join(self._summary_dir, "eval")) - if FLAGS.steps_per_summary >= _MIN_SUMMARY_STEPS: + if FLAGS.steps_per_execution >= _MIN_SUMMARY_STEPS: # Only writes summary when the stats are collected sufficiently over enough steps. 
self.summary_writers['train'] = tf.summary.create_file_writer(os.path.join(self._summary_dir, "train")) else: diff --git a/deepray/utils/resource_loader.py b/deepray/utils/resource_loader.py index 3e0a6350..e8c9ed61 100644 --- a/deepray/utils/resource_loader.py +++ b/deepray/utils/resource_loader.py @@ -21,7 +21,7 @@ import tensorflow as tf INCLUSIVE_MIN_TF_VERSION_FOR_ABI_COMPATIBILITY = "2.9.1" -EXCLUSIVE_MAX_TF_VERSION_FOR_ABI_COMPATIBILITY = "2.13.0" +EXCLUSIVE_MAX_TF_VERSION_FOR_ABI_COMPATIBILITY = "2.15.0" abi_warning_already_raised = False SKIP_CUSTOM_OPS = False @@ -63,7 +63,7 @@ def ops(self): pytest.skip("Skipping the test because a custom ops " "was being loaded while --skip-custom-ops was set.") if self._ops is None: - self.display_warning_if_incompatible() + # self.display_warning_if_incompatible() self._ops = tf.load_op_library(get_path_to_datafile(self.relative_path, is_so=True)) return self._ops diff --git a/deepray/utils/test_utils.py b/deepray/utils/test_utils.py index 338370a7..81cef91a 100644 --- a/deepray/utils/test_utils.py +++ b/deepray/utils/test_utils.py @@ -14,21 +14,21 @@ # ============================================================================== """Utilities for testing Deepray.""" +import inspect import os import random -import inspect import numpy as np import pytest import tensorflow as tf - from packaging.version import Version + from deepray import options from deepray.utils import resource_loader -if Version(tf.__version__).release >= Version("2.13").release: - # New versions of Keras require importing from `keras.src` when - # importing internal symbols. +if Version(tf.__version__) > Version("2.16.0"): + from tf_keras.src.testing_infra.test_utils import layer_test # noqa: F401 +elif Version(tf.__version__).release >= Version("2.13").release: from keras.src.testing_infra.test_utils import layer_test # noqa: F401 elif Version(tf.__version__) >= Version("2.9"): from keras.testing_infra.test_utils import layer_test # noqa: F401 diff --git a/deepray/utils/timer.py b/deepray/utils/timer.py new file mode 100644 index 00000000..8593299f --- /dev/null +++ b/deepray/utils/timer.py @@ -0,0 +1,34 @@ +import time +from functools import wraps + + +class Timer: + """Useage + if __name__ == "__main__": + with Timer(): + # ... + """ + + def __enter__(self): + self._enter_time = time.time() + + def __exit__(self, *exc_args): + self._exit_time = time.time() + print(f"{self._exit_time - self._enter_time:.2f} seconds elapsed") + + +def timer(func): + """Useage + @timer + def your_function(): + # ... + """ + + @wraps(func) + def inner(*args, **kwargs): + start_time = time.time() + retval = func(*args, **kwargs) + print(f"{time.time() - start_time:.2f} seconds elapsed") + return retval + + return inner diff --git a/deepray/utils/types.py b/deepray/utils/types.py index b92a34d8..01dd101e 100644 --- a/deepray/utils/types.py +++ b/deepray/utils/types.py @@ -19,32 +19,37 @@ import importlib import numpy as np import tensorflow as tf +import tf_keras as keras from packaging.version import Version -# TODO: Remove once https://github.com/tensorflow/tensorflow/issues/44613 is resolved -if Version(tf.__version__).release >= Version("2.13").release: - # New versions of Keras require importing from `keras.src` when - # importing internal symbols. - from keras.src.engine import keras_tensor +# Find KerasTensor. +if Version(tf.__version__).release >= Version("2.16").release: + # Determine if loading keras 2 or 3. 
+ if (hasattr(tf.keras, "version") and Version(tf.keras.version()).release >= Version("3.0").release): + from keras import KerasTensor + else: + from tf_keras.src.engine.keras_tensor import KerasTensor +elif Version(tf.__version__).release >= Version("2.13").release: + from keras.src.engine.keras_tensor import KerasTensor elif Version(tf.__version__).release >= Version("2.5").release: - from keras.engine import keras_tensor + from keras.engine.keras_tensor import KerasTensor else: - from tensorflow.python.keras.engine import keras_tensor + from tensorflow.python.keras.engine.keras_tensor import KerasTensor Number = Union[float, int, np.float16, np.float32, np.float64, np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64,] -Initializer = Union[None, dict, str, Callable, tf.keras.initializers.Initializer] -Regularizer = Union[None, dict, str, Callable, tf.keras.regularizers.Regularizer] -Constraint = Union[None, dict, str, Callable, tf.keras.constraints.Constraint] +Initializer = Union[None, dict, str, Callable, keras.initializers.Initializer] +Regularizer = Union[None, dict, str, Callable, keras.regularizers.Regularizer] +Constraint = Union[None, dict, str, Callable, keras.constraints.Constraint] Activation = Union[None, str, Callable] -if importlib.util.find_spec("tensorflow.keras.optimizers.legacy") is not None: - Optimizer = Union[tf.keras.optimizers.Optimizer, tf.keras.optimizers.legacy.Optimizer, str] +if importlib.util.find_spec("tf_keras.optimizers.legacy") is not None: + Optimizer = Union[keras.optimizers.Optimizer, keras.optimizers.legacy.Optimizer, str] else: - Optimizer = Union[tf.keras.optimizers.Optimizer, str] + Optimizer = Union[keras.optimizers.Optimizer, str] TensorLike = Union[List[Union[Number, list]], tuple, Number, np.ndarray, tf.Tensor, tf.SparseTensor, tf.Variable, - keras_tensor.KerasTensor,] + KerasTensor,] FloatTensorLike = Union[tf.Tensor, float, np.float16, np.float32, np.float64] AcceptableDTypes = Union[tf.DType, np.dtype, type, int, str, None] diff --git a/deepray/version.py b/deepray/version.py index 7c75ae0c..d62f5246 100644 --- a/deepray/version.py +++ b/deepray/version.py @@ -16,12 +16,12 @@ # Required TensorFlow version [min, max) INCLUSIVE_MIN_TF_VERSION = "2.9.1" -EXCLUSIVE_MAX_TF_VERSION = "2.13.0" +EXCLUSIVE_MAX_TF_VERSION = "2.18.0" # We follow Semantic Versioning (https://semver.org/) _MAJOR_VERSION = "0" _MINOR_VERSION = "21" -_PATCH_VERSION = "9" +_PATCH_VERSION = "86" # When building releases, we can update this value on the release branch to # reflect the current release candidate ('rc0', 'rc1') or, finally, the official diff --git a/deepray/workspace0.bzl b/deepray/workspace0.bzl index d954c362..6b991de2 100644 --- a/deepray/workspace0.bzl +++ b/deepray/workspace0.bzl @@ -1,9 +1,9 @@ """TensorFlow workspace initialization. 
Consult the WORKSPACE on how to use it.""" +load("@com_github_nelhage_rules_boost//:boost/boost.bzl", "boost_deps") +load("@rules_compressor//tensorflow:workspace2.bzl", rules_compressor_deps = "tf_workspace2") load("@rules_foreign_cc//foreign_cc:repositories.bzl", "rules_foreign_cc_dependencies") load("@rules_pkg//:deps.bzl", "rules_pkg_dependencies") -load("@rules_compressor//tensorflow:workspace2.bzl", rules_compressor_deps = "tf_workspace2") -load("@com_github_nelhage_rules_boost//:boost/boost.bzl", "boost_deps") def workspace(): # If a target is bound twice, the later one wins, so we have to do tf bindings diff --git a/deepray/workspace2.bzl b/deepray/workspace2.bzl index fbfecb7c..b52c3b6f 100644 --- a/deepray/workspace2.bzl +++ b/deepray/workspace2.bzl @@ -1,8 +1,12 @@ """Deepray workspace initialization. Consult the WORKSPACE on how to use it.""" -# Import external repository rules. -load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository") load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") +load("//third_party:repo.bzl", "tf_http_archive") + +# Sanitize a dependency so that it works correctly from code that includes +# TensorFlow as a submodule. +def clean_dep(dep): + return str(Label(dep)) # Define all external repositories required by TensorFlow def _tf_repositories(): @@ -22,12 +26,6 @@ def _tf_repositories(): strip_prefix = "double-conversion-3.2.0", ) - git_repository( - name = "rules_python", - remote = "https://github.com/bazelbuild/rules_python.git", - tag = "0.16.2", - ) - http_archive( name = "eigen3", urls = [ @@ -47,17 +45,18 @@ def _tf_repositories(): type = "tar.gz", strip_prefix = "OpenBLAS-{}".format(OPENBLAS_VERSION), build_file = Label("//third_party:openblas.BUILD"), - # sha256 = "5d9491d07168a5d00116cdc068a40022c3455bf9293c7cb86a65b1054d7e5114", + sha256 = "4c25cb30c4bb23eddca05d7d0a85997b8db6144f5464ba7f8c09ce91e2f35543", ) - ARROW_VERSION = "7.0.0" http_archive( - name = "com_github_apache_arrow", - sha256 = "57e13c62f27b710e1de54fd30faed612aefa22aa41fa2c0c3bacd204dd18a8f3", + name = "org_apache_arrow", build_file = Label("//third_party/arrow:arrow.BUILD"), - strip_prefix = "arrow-apache-arrow-" + ARROW_VERSION, + patches = ["//third_party/arrow:arrow-20.patch"], + patch_args = ["-p1"], + sha256 = "89efbbf852f5a1f79e9c99ab4c217e2eb7f991837c005cba2d4a2fbd35fad212", + strip_prefix = "apache-arrow-20.0.0", urls = [ - "https://github.com/apache/arrow/archive/apache-arrow-{}.tar.gz".format(ARROW_VERSION), + "https://github.com/apache/arrow/releases/download/apache-arrow-20.0.0/apache-arrow-20.0.0.tar.gz", ], ) @@ -94,7 +93,7 @@ def _tf_repositories(): ) http_archive( - name = "com_github_apache_thrift", # Apache License 2.0 + name = "org_apache_thrift", # Apache License 2.0 build_file = Label("//third_party/thrift:thrift.BUILD"), sha256 = "5da60088e60984f4f0801deeea628d193c33cec621e78c8a43a5d8c4055f7ad9", strip_prefix = "thrift-0.13.0", @@ -182,30 +181,6 @@ def _tf_repositories(): ], ) - http_archive( - name = "libcuckoo", - build_file = "//third_party:libcuckoo.BUILD", - patch_args = ["-p1"], - patches = [ - "//third_party:cuckoohash_map.patch", - ], - sha256 = "7238436b7346a0edf4ce57c12f43f71af5347b8b15f9bf2f0e24bfdca6225fc5", - strip_prefix = "libcuckoo-0.3", - urls = [ - "https://github.com/efficient/libcuckoo/archive/v0.3.zip", - ], - ) - - http_archive( - name = "sparsehash", - build_file = "//third_party:sparsehash.BUILD", - sha256 = "d4a43cad1e27646ff0ef3a8ce3e18540dbcb1fdec6cc1d1cb9b5095a9ca2a755", - strip_prefix = 
"sparsehash-c11-2.11.1", - urls = [ - "https://github.com/sparsehash/sparsehash-c11/archive/v2.11.1.tar.gz", - ], - ) - http_archive( name = "murmurhash", build_file = "//third_party:murmurhash.BUILD", @@ -228,23 +203,115 @@ def _tf_repositories(): ) http_archive( + name = "com_github_NVIDIA_cuCollections", + # sha256 = "6560547c63e4af82b0f202cb710ceabb3f21347a4b996db565a411da5b17aba0", + build_file = "//third_party/cuCollections:cuCollections.BUILD", + strip_prefix = "cuCollections-2303a7a2a03e38385dbe1bbc91c55007a94a9192", + urls = [ + "https://github.com/NVIDIA/cuCollections/archive/2303a7a2a03e38385dbe1bbc91c55007a94a9192.zip", + ], + ) + + tf_http_archive( name = "cuCollections", # Apache License 2.0 - # patches = ["//third_party/cucollection:cucollection.patch"], - build_file = "//third_party/cucollection:cuco.BUILD", + patch_file = [clean_dep("//third_party/cuCollections:cucollection.patch")], + build_file = clean_dep("//third_party/cuCollections:cuco.BUILD"), sha256 = "c5c77a1f96b439b67280e86483ce8d5994aa4d14b7627b1d3bd7880be6be23fa", strip_prefix = "cuCollections-193de1aa74f5721717f991ca757dc610c852bb17", urls = [ "https://github.com/NVIDIA/cuCollections/archive/193de1aa74f5721717f991ca757dc610c852bb17.zip", + "https://github.com/NVIDIA/cuCollections/archive/193de1aa74f5721717f991ca757dc610c852bb17.zip", ], ) - http_archive( - name = "sparsehash_c11", - build_file = "//third_party:sparsehash_c11.BUILD", + tf_http_archive( + name = "sparsehash_c11", # BSD-3-Clause License + build_file = clean_dep("//third_party/sparsehash_c11:sparsehash_c11.BUILD"), + patch_file = [ + clean_dep("//third_party/sparsehash_c11:sparsehash_c11.patch"), + ], sha256 = "d4a43cad1e27646ff0ef3a8ce3e18540dbcb1fdec6cc1d1cb9b5095a9ca2a755", strip_prefix = "sparsehash-c11-2.11.1", urls = [ "https://github.com/sparsehash/sparsehash-c11/archive/v2.11.1.tar.gz", + "https://github.com/sparsehash/sparsehash-c11/archive/v2.11.1.tar.gz", + ], + ) + + # http_archive( + # name = "sparsehash_c11", # BSD-3-Clause License + # build_file = "//third_party/sparsehash_c11:sparsehash_c11.BUILD", + # patch_args = ["-p1"], + # patches = ["//third_party/sparsehash_c11:sparsehash_c11.patch"], + # sha256 = "d4a43cad1e27646ff0ef3a8ce3e18540dbcb1fdec6cc1d1cb9b5095a9ca2a755", + # strip_prefix = "sparsehash-c11-2.11.1", + # urls = [ + # "https://github.com/sparsehash/sparsehash-c11/archive/v2.11.1.tar.gz", + # # "https://github.com/sparsehash/sparsehash-c11/archive/v2.11.1.tar.gz", + # ], + # ) + + http_archive( + name = "cutlass", + urls = ["https://github.com/NVIDIA/cutlass/archive/319a389f42b776fae5701afcb943fc03be5b5c25.zip"], + build_file = "//third_party:cutlass.BUILD", + strip_prefix = "cutlass-319a389f42b776fae5701afcb943fc03be5b5c25", + ) + + http_archive( + name = "flash_attn", + urls = ["https://github.com/Dao-AILab/flash-attention/archive/9818f85fee29ac6b60c9214bce841f8109a18b1b.zip"], # v1.0.4 + build_file = "//third_party/flash_attn:flash_attn.BUILD", + sha256 = "15f29a1095600ba2a3af688fa96a0a48635edb90fffec56c6eb7c48a4a322d2b", + strip_prefix = "flash-attention-9818f85fee29ac6b60c9214bce841f8109a18b1b", + patches = [ + "//third_party/flash_attn:flash_attn.patch", + ], + patch_args = ["-p1"], + ) + + http_archive( + name = "libcuckoo", + build_file = "//third_party:libcuckoo.BUILD", + patch_args = ["-p1"], + patches = [ + "//third_party:cuckoohash_map.patch", + ], + sha256 = "7238436b7346a0edf4ce57c12f43f71af5347b8b15f9bf2f0e24bfdca6225fc5", + strip_prefix = "libcuckoo-0.3", + urls = [ + 
"https://github.com/efficient/libcuckoo/archive/v0.3.zip", + ], + ) + + http_archive( + name = "com_github_google_leveldb", + sha256 = "f99dc5dcb6f23e500b197db02e993ee0d3bafd1ac84b85ab50de9009b36fbf03", + strip_prefix = "leveldb-5d94ad4d95c09d3ac203ddaf9922e55e730706a8", + build_file = "//third_party:leveldb.BUILD", + urls = [ + "https://github.com/google/leveldb/archive/5d94ad4d95c09d3ac203ddaf9922e55e730706a8.tar.gz", + ], + ) + + tf_http_archive( + name = "readerwriterqueue_archive", + build_file = clean_dep("//third_party:readerwriterqueue.BUILD"), + sha256 = "fc68f55bbd49a8b646462695e1777fb8f2c0b4f342d5e6574135211312ba56c1", + strip_prefix = "readerwriterqueue-1.0.6", + urls = [ + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/cameron314/readerwriterqueue/archive/v1.0.6.tar.gz", + "https://github.com/cameron314/readerwriterqueue/archive/v1.0.6.tar.gz", + ], + ) + + http_archive( + name = "openssl", + sha256 = "9f54d42aed56f62889e8384895c968e24d57eae701012776d5f18fb9f2ae48b0", + build_file = "//third_party:openssl.BUILD", + strip_prefix = "openssl-openssl-3.0.2", + urls = [ + "https://github.com/openssl/openssl/archive/refs/tags/openssl-3.0.2.tar.gz", ], ) diff --git a/deepray/workspace3.bzl b/deepray/workspace3.bzl index 2aaed1e7..8ff78d85 100644 --- a/deepray/workspace3.bzl +++ b/deepray/workspace3.bzl @@ -1,16 +1,8 @@ """TensorFlow workspace initialization. Consult the WORKSPACE on how to use it.""" load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") -load("@bazel_tools//tools/build_defs/repo:git.bzl", "git_repository") def workspace(): - http_archive( - name = "rules_cc", - urls = ["https://github.com/bazelbuild/rules_cc/releases/download/0.0.9/rules_cc-0.0.9.tar.gz"], - sha256 = "2037875b9a4456dce4a79d112a8ae885bbc4aad968e6587dca6e64f3a0900cdf", - strip_prefix = "rules_cc-0.0.9", - ) - http_archive( name = "rules_foreign_cc", sha256 = "476303bd0f1b04cc311fc258f1708a5f6ef82d3091e53fd1977fa20383425a6a", @@ -27,12 +19,10 @@ def workspace(): sha256 = "8f9ee2dc10c1ae514ee599a8b42ed99fa262b757058f65ad3c384289ff70c4b8", ) - git_repository( + http_archive( name = "rules_compressor", - # branch = "main", - remote = "https://github.com/fuhailin/rules_compressor.git", - commit = "a98ee1d04dc8175aa87a9640caef25725a78ef03", - shallow_since = "1681204047 +0800", + url = "https://github.com/fuhailin/rules_compressor/archive/refs/heads/main.zip", + strip_prefix = "rules_compressor-main", ) # Alias so it can be loaded without assigning to a different symbol to prevent diff --git a/docker.sh b/docker.sh index 1d8dcc9b..2c2f6bb8 100644 --- a/docker.sh +++ b/docker.sh @@ -2,20 +2,26 @@ set -x -e -PY_VERSION=${1:-"3.8"} -TF_VERSION=${2:-"2.9.1"} -CUDA_VERSION=${3:-"11.6.2"} -OS_VERSION=${3:-"20.04"} +PY_VERSION=${1:-"3.10"} +TF_VERSION=${2:-"2.15.0"} +CUDA_VERSION=${3:-"12.2.2"} +OS_VERSION=${3:-"22.04"} -docker pull hailinfufu/deepray-release:latest-py${PY_VERSION}-tf${TF_VERSION}-cu${CUDA_VERSION}-ubuntu${OS_VERSION} +# docker pull hailinfufu/deepray-release:nightly-py${PY_VERSION}-tf${TF_VERSION}-cu${CUDA_VERSION}-ubuntu${OS_VERSION} + +# docker volume create -d local --name dev-build \ +# --opt device="/data/fuhailin/workspaces" \ +# --opt type="none" \ +# --opt o="bind" docker run --gpus all -it \ - --rm=true \ - --name="deepray_dev" \ - -w /workspaces \ + --rm \ + --network=host \ + --name="deepray_dev_py${PY_VERSION}" \ + --volume=/data/fuhailin/workspaces/datasets/:/datasets \ --volume=dev-build:/workspaces \ - --shm-size=1g \ - --device /dev/fuse \ - 
--network host \ --privileged \ - hailinfufu/deepray-release:latest-py${PY_VERSION}-tf${TF_VERSION}-cu${CUDA_VERSION}-ubuntu${OS_VERSION} /bin/bash + --cap-add=SYS_PTRACE \ + --shm-size=1g \ + --ulimit memlock=-1 \ + hailinfufu/deepray-release:nightly-py${PY_VERSION}-tf${TF_VERSION}-cu${CUDA_VERSION}-ubuntu${OS_VERSION} diff --git a/modelzoo/CV/Classify_images_of_clothing/run_horovod.sh b/modelzoo/CV/Classify_images_of_clothing/run_horovod.sh index 50110616..2caa469a 100644 --- a/modelzoo/CV/Classify_images_of_clothing/run_horovod.sh +++ b/modelzoo/CV/Classify_images_of_clothing/run_horovod.sh @@ -14,66 +14,28 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - -echo "Container nvidia build = " $NVIDIA_BUILD_ID - -keras_use_ctl=${1:-"true"} -num_gpu=${2:-"4"} +set -eu +set -o pipefail batch_size=${3:-"1024"} learning_rate=${4:-"5e-6"} -precision=${5:-"fp32"} -use_xla=${6:-"true"} -epochs=${7:-"100"} -model=${8:-"demo"} - - -if [ $num_gpu -gt 1 ] ; then - mpi_command="mpirun -np $num_gpu \ - --allow-run-as-root -bind-to none -map-by slot \ - -x NCCL_DEBUG=INFO \ - -x LD_LIBRARY_PATH \ - -x PATH -mca pml ob1 -mca btl ^openib" - use_hvd="--use_horovod" -else - mpi_command="" - use_hvd="" -fi -if [ "$precision" = "fp16" ] ; then - echo "fp16 activated!" - use_fp16="--dtype=fp16" -else - use_fp16="" -fi - -if [ "$use_xla" = "true" ] ; then - use_xla_tag="--enable_xla" - echo "XLA activated" -else - use_xla_tag="" -fi - - -export GBS=$(expr $batch_size \* $num_gpu) -printf -v TAG "tf_training_fashion_mnist_%s_%s_gbs%d" "$model" "$precision" $GBS -DATESTAMP=`date +'%y%m%d%H%M%S'` +printf -v TAG "tf_training_fashion_mnist_gbs%d" $batch_size +DATESTAMP=$(date +'%y%m%d%H%M%S') #Edit to save logs & checkpoints in a different directory -RESULTS_DIR=/results/${TAG}_${DATESTAMP} +RESULTS_DIR=/workspaces/results/${TAG}_${DATESTAMP} LOGFILE=$RESULTS_DIR/$TAG.$DATESTAMP.log mkdir -m 777 -p $RESULTS_DIR printf "Saving checkpoints to %s\n" "$RESULTS_DIR" printf "Logs written to %s\n" "$LOGFILE" set -x -$mpi_command python -m examples.CV.Classify_images_of_clothing.train \ - --train_data=fashion_mnist \ - --keras_use_ctl=$keras_use_ctl \ - --num_gpus=$num_gpu \ - --batch_size=$batch_size \ - --learning_rate=$learning_rate \ - --epochs=$epochs \ - --model_dir=${RESULTS_DIR} \ - $use_hvd $use_fp16 $use_xla_tag |& tee $LOGFILE - +CUDA_VISIBLE_DEVICES=0 python train.py \ + --use_custom_training_loop=True \ + --run_eagerly=False \ + --train_data=fashion_mnist \ + --batch_size=$batch_size \ + --learning_rate=$learning_rate \ + --epochs=3 \ + --model_dir=${RESULTS_DIR} |& tee $LOGFILE set +x diff --git a/modelzoo/CV/Classify_images_of_clothing/train.py b/modelzoo/CV/Classify_images_of_clothing/train.py index c81ee943..590a3f5d 100644 --- a/modelzoo/CV/Classify_images_of_clothing/train.py +++ b/modelzoo/CV/Classify_images_of_clothing/train.py @@ -1,39 +1,37 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - import tensorflow as tf -from absl import app, flags - -from deepray.core.base_trainer import Trainer -from deepray.core.common import distribution_utils +from absl import flags +import datetime, os +import deepray as dp +from deepray.core.trainer import Trainer from deepray.datasets.fashion_mnist import FashionMNIST -FLAGS = flags.FLAGS - -def main(_): - _strategy = distribution_utils.get_distribution_strategy() - 
data_pipe = FashionMNIST() - with distribution_utils.get_strategy_scope(_strategy): - model = tf.keras.Sequential( - [ - tf.keras.layers.Flatten(input_shape=(28, 28)), - tf.keras.layers.Dense(128, activation='relu'), - tf.keras.layers.Dense(10) - ] - ) +def main(): + model = tf.keras.models.Sequential( + [ + tf.keras.layers.Flatten(input_shape=(28, 28)), + tf.keras.layers.Dense(128, activation='relu'), + tf.keras.layers.Dropout(0.2), + tf.keras.layers.Dense(10, activation='softmax') + ] + ) trainer = Trainer( model=model, - loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer='adam', + loss='sparse_categorical_crossentropy', metrics=['accuracy'], ) - train_input_fn = data_pipe(FLAGS.train_data, FLAGS.batch_size, is_training=True) - trainer.fit(train_input=train_input_fn,) + data_pipe = FashionMNIST() + train_input_fn = data_pipe(flags.FLAGS.batch_size, is_training=True) + + # logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S")) + logdir = os.path.join(flags.FLAGS.model_dir, 'tensorboard') + + tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1) + trainer.fit(train_input=train_input_fn, callbacks=[tensorboard_callback]) if __name__ == "__main__": - flags.mark_flag_as_required("model_dir") - app.run(main) + dp.runner(main) diff --git a/modelzoo/CV/GAN/train.py b/modelzoo/CV/GAN/train.py index c9b1c2ae..c56339f7 100644 --- a/modelzoo/CV/GAN/train.py +++ b/modelzoo/CV/GAN/train.py @@ -8,18 +8,17 @@ from absl import app, flags from datetime import datetime -from deepray.core.base_trainer import Trainer +from deepray.core.trainer import Trainer from deepray.core.common import distribution_utils from deepray.datasets.mnist import Mnist -FLAGS = flags.FLAGS FLAGS( [ sys.argv[0], "--train_data=mnist", # "--distribution_strategy=off", # "--run_eagerly=true", - "--steps_per_summary=10", + "--steps_per_execution=10", # "--use_horovod=True", # "--batch_size=1024", ] diff --git a/modelzoo/CV/SwinTransformers/train.py b/modelzoo/CV/SwinTransformers/train.py index 772cb43d..246dfb83 100644 --- a/modelzoo/CV/SwinTransformers/train.py +++ b/modelzoo/CV/SwinTransformers/train.py @@ -6,13 +6,11 @@ from absl import app, flags from tensorflow import keras -from deepray.core.base_trainer import Trainer +from deepray.core.trainer import Trainer from deepray.core.common import distribution_utils from deepray.datasets.cifar import CIFAR100 from .model import BaseModel -FLAGS = flags.FLAGS - learning_rate = 1e-3 batch_size = 128 num_epochs = 40 diff --git a/modelzoo/CV/mnist/run_early.sh b/modelzoo/CV/mnist/run_early.sh index 3bfb479e..15583d12 100644 --- a/modelzoo/CV/mnist/run_early.sh +++ b/modelzoo/CV/mnist/run_early.sh @@ -26,14 +26,6 @@ use_xla=${6:-"true"} epochs=${7:-"10"} model=${8:-"demo"} -if [ $num_gpu -gt 1 ]; then - hvd_command="horovodrun -np $num_gpu " - use_hvd="--use_horovod" -else - hvd_command="" - use_hvd="--distribution_strategy=off" -fi - if [ "$precision" = "fp16" ]; then echo "fp16 activated!" 
use_fp16="--dtype=fp16" @@ -60,15 +52,14 @@ printf "Saving checkpoints to %s\n" "$RESULTS_DIR" printf "Logs written to %s\n" "$LOGFILE" set -x -$hvd_command python train_earlystop.py \ +python train_earlystop.py \ --train_data=mnist \ --keras_use_ctl=$keras_use_ctl \ --num_gpus=$num_gpu \ --batch_size=$batch_size \ --learning_rate=$learning_rate \ - --steps_per_summary=20 \ + --steps_per_execution=20 \ --epochs=$epochs \ --model_dir=${RESULTS_DIR} \ - $use_hvd $use_fp16 $use_xla_tag |& tee $LOGFILE - + $use_fp16 $use_xla_tag |& tee $LOGFILE set +x diff --git a/modelzoo/CV/mnist/run_horovod.sh b/modelzoo/CV/mnist/run_horovod.sh index a552bfb2..5ae789ae 100644 --- a/modelzoo/CV/mnist/run_horovod.sh +++ b/modelzoo/CV/mnist/run_horovod.sh @@ -15,31 +15,13 @@ # limitations under the License. # ============================================================================== -echo "Container nvidia build = " $NVIDIA_BUILD_ID - -keras_use_ctl=${1:-"true"} -num_gpu=${2:-"1"} -batch_size=${3:-"128"} -learning_rate=${4:-"5e-6"} -precision=${5:-"fp32"} -use_xla=${6:-"true"} -epochs=${7:-"1"} -model=${8:-"demo"} - -if [ $num_gpu -gt 1 ]; then - mpi_command="mpirun -np $num_gpu \ - --allow-run-as-root -bind-to none -map-by slot \ - -x NCCL_DEBUG=INFO \ - -x LD_LIBRARY_PATH \ - -x PATH -mca pml ob1 -mca btl ^openib" - use_hvd="--use_horovod" -else - mpi_command="" - use_hvd="" -fi +batch_size=${1:-"128"} +learning_rate=${2:-"5e-6"} +precision=${3:-"fp32"} +use_xla=${4:-"False"} +epochs=${5:-"1"} if [ "$precision" = "fp16" ]; then - echo "fp16 activated!" use_fp16="--dtype=fp16" else use_fp16="" @@ -47,13 +29,12 @@ fi if [ "$use_xla" = "true" ]; then use_xla_tag="--enable_xla" - echo "XLA activated" else use_xla_tag="" fi -export GBS=$(expr $batch_size \* $num_gpu) -printf -v TAG "tf_training_mnist_%s_%s_gbs%d" "$model" "$precision" $GBS +export GBS=$(expr $batch_size) +printf -v TAG "tf_training_mnist_gbs%d" $GBS DATESTAMP=$(date +'%y%m%d%H%M%S') #Edit to save logs & checkpoints in a different directory @@ -64,16 +45,15 @@ printf "Saving checkpoints to %s\n" "$RESULTS_DIR" printf "Logs written to %s\n" "$LOGFILE" set -x -$mpi_command python train.py \ - --train_data=mnist \ - --keras_use_ctl=$keras_use_ctl \ - --num_gpus=$num_gpu \ +CUDA_VISIBLE_DEVICES=0 python train.py \ + --run_eagerly=False \ --batch_size=$batch_size \ --learning_rate=$learning_rate \ - --steps_per_summary=1 \ - --stop_steps=20 \ + --steps_per_execution=10 \ + --stop_steps=-1 \ --epochs=$epochs \ --model_dir=${RESULTS_DIR} \ - $use_hvd $use_fp16 $use_xla_tag |& tee $LOGFILE + $use_fp16 $use_xla_tag +# |& tee $LOGFILE set +x diff --git a/modelzoo/CV/mnist/train.py b/modelzoo/CV/mnist/train.py index 9665c2ef..dbd8be68 100644 --- a/modelzoo/CV/mnist/train.py +++ b/modelzoo/CV/mnist/train.py @@ -5,44 +5,93 @@ import os import sys +import keras +import numpy as np import tensorflow as tf -from absl import app, flags +from absl import flags -from deepray.core.base_trainer import Trainer -from deepray.core.common import distribution_utils +import deepray as dp +from deepray.core.trainer import Trainer from deepray.datasets.mnist import Mnist -FLAGS = flags.FLAGS -FLAGS( - [ - sys.argv[0], - "--train_data=mnist", - # "--distribution_strategy=off", - # "--run_eagerly=true", - "--steps_per_summary=10", - # "--use_horovod=True", - # "--batch_size=1024", - ] -) - - -def main(_): - _strategy = distribution_utils.get_distribution_strategy() - data_pipe = Mnist() - with distribution_utils.get_strategy_scope(_strategy): - mnist_model = 
tf.keras.Sequential( - [ - tf.keras.layers.Conv2D(32, [3, 3], activation="relu"), - tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), - tf.keras.layers.Conv2D(64, [3, 3], activation="relu"), - tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), - tf.keras.layers.Dropout(0.25), - tf.keras.layers.Flatten(), - tf.keras.layers.Dense(128, activation="relu"), - tf.keras.layers.Dropout(0.5), - tf.keras.layers.Dense(10, activation="softmax"), - ] - ) + +def define_flasg(): + flags.FLAGS( + [ + sys.argv[0], + "--train_data=mnist", + # "--run_eagerly=true", + "--steps_per_execution=1", + # "--batch_size=1024", + ] + ) + + +class EarlyStoppingAtMinLoss(keras.callbacks.Callback): + """Stop training when the loss is at its min, i.e. the loss stops decreasing. + + Arguments: + patience: Number of epochs to wait after min has been hit. After this + number of no improvement, training stops. + """ + + def __init__(self, patience=0): + super().__init__() + self.patience = patience + # best_weights to store the weights at which the minimum loss occurs. + self.best_weights = None + + def on_train_begin(self, logs=None): + # The number of epoch it has waited when loss is no longer minimum. + self.wait = 0 + # The epoch the training stops at. + self.stopped_epoch = 0 + # Initialize the best as infinity. + self.best = np.Inf + + # def on_batch_begin(self, batch, logs=None): + # pass + + # def on_batch_end(self, batch, logs=None): + # if batch < 5: + # print(batch, self.model.get_weights()[0][0][0][0]) + # pass + + def on_epoch_end(self, epoch, logs=None): + print(logs) + current = logs.get("loss") + if np.less(current, self.best): + self.best = current + self.wait = 0 + # Record the best weights if current results is better (less). + self.best_weights = self.model.get_weights() + else: + self.wait += 1 + if self.wait >= self.patience: + self.stopped_epoch = epoch + self.model.stop_training = True + print("Restoring model weights from the end of the best epoch.") + self.model.set_weights(self.best_weights) + + def on_train_end(self, logs=None): + if self.stopped_epoch > 0: + print("Epoch %05d: early stopping" % (self.stopped_epoch + 1)) + + +def main(): + mnist_model = tf.keras.Sequential( + [ + tf.keras.layers.Conv2D(32, [3, 3], activation="relu"), + tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), + tf.keras.layers.Conv2D(64, [3, 3], activation="relu"), + tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), + tf.keras.layers.Dropout(0.25), + tf.keras.layers.Flatten(), + tf.keras.layers.Dense(128, activation="relu"), + tf.keras.layers.Dropout(0.5), + tf.keras.layers.Dense(10, activation="softmax"), + ] + ) trainer = Trainer( optimizer=tf.keras.optimizers.Adam(0.001), @@ -51,14 +100,19 @@ def main(_): # loss='sparse_categorical_crossentropy', metrics=["accuracy"] ) - + data_pipe = Mnist() + train_input = data_pipe(flags.FLAGS.batch_size, is_training=True) + test_input = data_pipe(flags.FLAGS.batch_size, is_training=False) tboard_callback = tf.keras.callbacks.TensorBoard( - log_dir=os.path.join(FLAGS.model_dir, 'tensorboard'), histogram_freq=1, profile_batch='10,20' + log_dir=os.path.join(flags.FLAGS.model_dir, 'tensorboard'), histogram_freq=1, profile_batch='1,2' ) - train_input = data_pipe(FLAGS.train_data, FLAGS.batch_size, is_training=True) - trainer.fit(train_input=train_input, callbacks=[tboard_callback]) + trainer.fit( + train_input=train_input, + eval_input=test_input, + callbacks=[tboard_callback, EarlyStoppingAtMinLoss()], + ) if __name__ == "__main__": - app.run(main) + dp.runner(main) diff --git 
a/modelzoo/CV/mnist/train_earlystop.py b/modelzoo/CV/mnist/train_earlystop.py deleted file mode 100644 index 6c687991..00000000 --- a/modelzoo/CV/mnist/train_earlystop.py +++ /dev/null @@ -1,112 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import sys - -import keras -import numpy as np -import tensorflow as tf -from absl import app, flags - -from deepray.core.base_trainer import Trainer -from deepray.datasets.mnist import Mnist - -FLAGS = flags.FLAGS -FLAGS( - [ - sys.argv[0], - "--train_data=mnist", - # "--distribution_strategy=off", - # "--run_eagerly=true", - "--steps_per_summary=10", - # "--use_horovod=True", - # "--batch_size=1024", - ] -) - - -def get_model(): - return tf.keras.Sequential( - [ - tf.keras.layers.Conv2D(32, [3, 3], activation="relu"), - tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), - tf.keras.layers.Flatten(), - tf.keras.layers.Dense(1), - ] - ) - - -class EarlyStoppingAtMinLoss(keras.callbacks.Callback): - """Stop training when the loss is at its min, i.e. the loss stops decreasing. - - Arguments: - patience: Number of epochs to wait after min has been hit. After this - number of no improvement, training stops. - """ - - def __init__(self, patience=0): - super().__init__() - self.patience = patience - # best_weights to store the weights at which the minimum loss occurs. - self.best_weights = None - - def on_train_begin(self, logs=None): - # The number of epoch it has waited when loss is no longer minimum. - self.wait = 0 - # The epoch the training stops at. - self.stopped_epoch = 0 - # Initialize the best as infinity. - self.best = np.Inf - - # def on_batch_begin(self, batch, logs=None): - # pass - - # def on_batch_end(self, batch, logs=None): - # if batch < 5: - # print(batch, self.model.get_weights()[0][0][0][0]) - # pass - - def on_epoch_end(self, epoch, logs=None): - print(logs) - current = logs.get("loss") - if np.less(current, self.best): - self.best = current - self.wait = 0 - # Record the best weights if current results is better (less). 
- self.best_weights = self.model.get_weights() - else: - self.wait += 1 - if self.wait >= self.patience: - self.stopped_epoch = epoch - self.model.stop_training = True - print("Restoring model weights from the end of the best epoch.") - self.model.set_weights(self.best_weights) - - def on_train_end(self, logs=None): - if self.stopped_epoch > 0: - print("Epoch %05d: early stopping" % (self.stopped_epoch + 1)) - - -def main(_): - data_pipe = Mnist() - model = get_model() - - trainer = Trainer( - optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.1), - model=model, - loss="mean_squared_error", - metrics=["mean_absolute_error"], - ) - - callbacks = [EarlyStoppingAtMinLoss()], - - train_input = data_pipe(FLAGS.train_data, FLAGS.batch_size, is_training=True) - trainer.fit( - train_input=train_input, - callbacks=callbacks, - ) - - -if __name__ == "__main__": - app.run(main) diff --git a/modelzoo/ELECTRA/.gitignore b/modelzoo/ELECTRA/.gitignore new file mode 100644 index 00000000..7a43e90b --- /dev/null +++ b/modelzoo/ELECTRA/.gitignore @@ -0,0 +1,129 @@ +# Initially taken from Github's Python gitignore file + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +#Data checkpoints and results +data/*/*/ +data/*/*.zip +checkpoints/ +results/* + +#Editor +.idea +.idea/* + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# vscode +.vscode diff --git a/modelzoo/ELECTRA/Dockerfile b/modelzoo/ELECTRA/Dockerfile new file mode 100644 index 00000000..88decd29 --- /dev/null +++ b/modelzoo/ELECTRA/Dockerfile @@ -0,0 +1,31 @@ +# syntax = docker/dockerfile:1 +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:20.07-tf2-py3 +FROM ${FROM_IMAGE_NAME} +RUN apt-get update && apt-get install -y pbzip2 pv bzip2 cabextract + +ENV DATA_PREP_WORKING_DIR /workspace/electra/data +WORKDIR /workspace +RUN git clone https://github.com/attardi/wikiextractor.git && cd wikiextractor && git checkout 6408a430fc504a38b04d37ce5e7fc740191dee16 && cd .. +RUN git clone https://github.com/soskek/bookcorpus.git + +WORKDIR /workspace/electra + +RUN pip install --no-cache-dir tqdm boto3 requests six ipdb h5py nltk progressbar filelock \ + git+https://github.com/NVIDIA/dllogger \ + nvidia-ml-py3==7.352.0 tokenizers==0.11.0 + +RUN apt-get install -y iputils-ping +COPY . . diff --git a/modelzoo/ELECTRA/LICENSE b/modelzoo/ELECTRA/LICENSE new file mode 100644 index 00000000..6b0b1270 --- /dev/null +++ b/modelzoo/ELECTRA/LICENSE @@ -0,0 +1,203 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + diff --git a/modelzoo/ELECTRA/NOTICE b/modelzoo/ELECTRA/NOTICE new file mode 100644 index 00000000..453fd085 --- /dev/null +++ b/modelzoo/ELECTRA/NOTICE @@ -0,0 +1,5 @@ +ELECTRA Tensorflow 2 + +This repository includes software from https://github.com/huggingface/transformers +licensed under the Apache License 2.0. + diff --git a/modelzoo/ELECTRA/README.md b/modelzoo/ELECTRA/README.md new file mode 100644 index 00000000..154a7dff --- /dev/null +++ b/modelzoo/ELECTRA/README.md @@ -0,0 +1,1005 @@ +# ELECTRA For TensorFlow2 + +This repository provides a script and recipe to train the ELECTRA model for TensorFlow2 to achieve state-of-the-art accuracy, and is tested and maintained by NVIDIA. 
+ + +## Table Of Contents +- [Model overview](#model-overview) + * [Model architecture](#model-architecture) + * [Default configuration](#default-configuration) + * [Feature support matrix](#feature-support-matrix) + * [Features](#features) + * [Mixed precision training](#mixed-precision-training) + * [Enabling mixed precision](#enabling-mixed-precision) + * [Enabling TF32](#enabling-tf32) + * [Glossary](#glossary) +- [Setup](#setup) + * [Requirements](#requirements) +- [Quick Start Guide](#quick-start-guide) +- [Advanced](#advanced) + * [Scripts and sample code](#scripts-and-sample-code) + * [Parameters](#parameters) + + [Pre-training parameters](#pre-training-parameters) + + [Fine-tuning parameters](#fine-tuning-parameters) + * [Command-line options](#command-line-options) + * [Getting the data](#getting-the-data) + + [Multi-dataset](#multi-dataset) + * [Training process](#training-process) + + [Pre-training](#pre-training) + + [Multi-node](#multi-node) + + [Fine-tuning](#fine-tuning) + * [Inference process](#inference-process) + + [Fine-tuning inference](#fine-tuning-inference) +- [Performance](#performance) + * [Benchmarking](#benchmarking) + + [Training performance benchmark](#training-performance-benchmark) + + [Inference performance benchmark](#inference-performance-benchmark) + * [Results](#results) + + [Training accuracy results](#training-accuracy-results) + - [Pre-training loss curves](#pre-training-loss-curves) + - [Pre-training loss results](#pre-training-loss-results) + - [Fine-tuning accuracy: NVIDIA DGX A100 (8x A100 40GB)](#fine-tuning-accuracy-nvidia-dgx-a100-8x-a100-40gb) + - [Fine-tuning accuracy: NVIDIA DGX-1 (8x V100 16GB)](#fine-tuning-accuracy-nvidia-dgx-1-8x-v100-16gb) + - [Fine-tuning accuracy: NVIDIA DGX-2 (16x V100 32GB)](#fine-tuning-accuracy-nvidia-dgx-2-16x-v100-32gb) + - [Training stability test](#training-stability-test) + * [Pre-training stability test: NVIDIA DGX A100 (8x A100 40GB)](#pre-training-stability-test-nvidia-dgx-a100-8x-a100-40gb) + * [Fine-tuning stability test: NVIDIA DGX-1 (8x V100 16GB)](#fine-tuning-stability-test-nvidia-dgx-1-8x-v100-16gb) + + [Training performance results](#training-performance-results) + - [Training performance: NVIDIA DGX A100 (8x A100 40GB)](#training-performance-nvidia-dgx-a100-8x-a100-40gb) + * [Pre-training NVIDIA DGX A100 (8x A100 40GB)](#pre-training-nvidia-dgx-a100-8x-a100-40gb) + * [Fine-tuning NVIDIA DGX A100 (8x A100 40GB)](#fine-tuning-nvidia-dgx-a100-8x-a100-40gb) + - [Training performance: NVIDIA DGX-1 (8x V100 16GB)](#training-performance-nvidia-dgx-1-8x-v100-16gb) + * [Pre-training NVIDIA DGX-1 (8x V100 16GB)](#pre-training-nvidia-dgx-1-8x-v100-16gb) + * [Fine-tuning NVIDIA DGX-1 (8x V100 16GB)](#fine-tuning-nvidia-dgx-1-8x-v100-16gb) + - [Training performance: NVIDIA DGX-2 (16x V100 32GB)](#training-performance-nvidia-dgx-2-16x-v100-32gb) + * [Pre-training NVIDIA DGX-2 (16x V100 32GB)](#pre-training-nvidia-dgx-2-16x-v100-32gb) + * [Fine-tuning NVIDIA DGX-2 (16x V100 32GB)](#fine-tuning-nvidia-dgx-2-16x-v100-32gb) + + [Inference performance results](#inference-performance-results) + - [Inference performance: NVIDIA DGX A100 (1x A100 40GB)](#inference-performance-nvidia-dgx-a100-1x-a100-40gb) + * [Fine-tuning inference on NVIDIA DGX A100 (1x A100 40GB)](#fine-tuning-inference-on-nvidia-dgx-a100-1x-a100-40gb) + - [Inference performance: NVIDIA T4](#inference-performance-nvidia-t4) + * [Fine-tuning inference on NVIDIA T4](#fine-tuning-inference-on-nvidia-t4) +- [Release notes](#release-notes) + * 
[Changelog](#changelog) + * [Known issues](#known-issues) + +## Model overview + +Electra (Efficiently Learning an Encoder that Classifies Token Replacements Accurately), is a novel pre-training method for language representations which outperforms existing techniques, given the same compute budget on a wide array of Natural Language Processing (NLP) tasks. This model is based on the [ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators](https://openreview.net/forum?id=r1xMH1BtvB) paper. NVIDIA's implementation of ELECTRA is an optimized version of the [Hugging Face implementation](https://huggingface.co/transformers/model_doc/electra.html), leveraging mixed precision arithmetic and Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures for faster training times with state-of-the-art accuracy. + +This repository contains the scripts to interactively launch data download, training, benchmarking and inference routines in a Docker container for pre-training on your own dataset (Wikipedia and BookCorpus shown as an example), and fine-tuning for tasks such as question answering. The major differences between the original implementation as described in the paper and this version of ELECTRA are as follows: + +- Scripts to download Wikipedia and BookCorpus datasets +- Scripts to preprocess downloaded data or a custom corpus into inputs and targets for pre-training in a modular fashion +- Automatic mixed precision (AMP) support and optimized for performance +- Multi-GPU and Multi-node training support with push-button scripts to reach state-of-the-art accuracy and performance. + +Other publicly available implementations of Electra include: +1. [Hugging Face](https://huggingface.co/transformers/model_doc/electra.html) +2. [Google's implementation](https://github.com/google-research/electra) + +This model is trained with mixed precision using Tensor Cores on Volta, Turing, and the NVIDIA Ampere GPU architectures. Additionally, this model provides push-button solutions to pre-training, fine-tuning and inference and on a corpus of choice. As a result, researchers can get results up to 4x faster than training without Tensor Cores. This model is tested against each NGC monthly container release to ensure consistent accuracy and performance over time. + +### Model architecture + +ELECTRA is a combination of two Transformer models: a generator and a discriminator. The generator’s role is to replace tokens in a sequence, and is therefore trained as a masked language model. The discriminator, which is the model we are interested in, tries to identify which tokens were replaced by the generator in the sequence. Both generator and discriminator use the same architecture as the encoder of the Transformer. The encoder is simply a stack of Transformer blocks, which consist of a multi-head attention layer followed by successive stages of feed-forward networks and layer normalization. The multi-head attention layer performs self-attention on multiple input representations. + +![Figure 1-1](https://1.bp.blogspot.com/-sHybc03nJRo/XmfLongdVYI/AAAAAAAAFbI/a0t5w_zOZ-UtxYaoQlVkmTRsyFJyFddtQCLcBGAsYHQ/s1600/image1.png "ELECTRA architecture") + + + +### Default configuration + +ELECTRA uses a new pre-training task called replaced token detection (RTD), that trains a bidirectional model (like a MLM) while learning from all input positions (like a LM). 
Inspired by generative adversarial networks (GANs), instead of corrupting the input by replacing tokens with “[MASK]” as in BERT, the generator is trained to corrupt the input by replacing some input tokens with incorrect, but somewhat plausible, fakes. On the other hand, the discriminator is trained to distinguish between “real” and “fake” input data. + +The [Google ELECTRA repository](https://github.com/google-research/electra) reports the results for three configurations of ELECTRA, each corresponding to a unique model size. This implementation provides the same configurations by default, which are described in the table below. + +| **Model** | **Hidden layers** | **Hidden unit size** | **Parameters** | +|:---------:|:----------:|:---:|:----:| +|ELECTRA_SMALL|12 encoder| 256 | 14M| +|ELECTRA_BASE |12 encoder| 768 |110M| +|ELECTRA_LARGE|24 encoder|1024 |335M| + +The following features were implemented in this model: +- General: + - Mixed precision support with TensorFlow Automatic Mixed Precision (TF-AMP) + - Multi-GPU support using Horovod + - XLA support + - Multi-Node support + + +- Training + - Pre-training support + - Fine-tuning example + + +- Inference: + - Joint predictions with beam search. + +### Feature support matrix + +The following features are supported by this model. + +| **Feature** | **ELECTRA** | +|:---------:|:----------:| +|LAMB|Yes| +|Automatic mixed precision (AMP)|Yes| +|XLA|Yes| +|Horovod Multi-GPU|Yes| +|Multi-node|Yes| + +#### Features + +**Automatic Mixed Precision (AMP)** + +This implementation of ELECTRA uses AMP to implement mixed precision training. It allows us to use FP16 training with FP32 master weights by modifying just a few lines of code. + +**Horovod** + +Horovod is a distributed training framework for TensorFlow, Keras, PyTorch, and MXNet. The goal of Horovod is to make distributed deep learning fast and easy to use. For more information about how to get started with Horovod, see the [Horovod: Official repository](https://github.com/horovod/horovod). + +Multi-GPU training with Horovod + +Our model uses Horovod to implement efficient multi-GPU training with NCCL. For details, see example sources in this repository or see the [TensorFlow tutorial](https://github.com/horovod/horovod/#usage). + +**XLA support (experimental)** + +XLA is a domain-specific compiler for linear algebra that can accelerate TensorFlow models with potentially no source code changes. The results are improvements in speed and memory usage: most internal benchmarks run ~1.1-1.5x faster after XLA is enabled. +[AMP](https://nvidia.github.io/apex/amp.html) is an abbreviation used for automatic mixed precision training. + +**Multi-node Training** + +Supported on a Pyxis/Enroot Slurm cluster. + +### Mixed precision training + +Mixed precision is the combined use of different numerical precisions in a computational method. [Mixed precision](https://arxiv.org/abs/1710.03740) training offers significant computational speedup by performing operations in half-precision format, while storing minimal information in single-precision to retain as much information as possible in critical parts of the network. Since the introduction of [Tensor Cores](https://developer.nvidia.com/tensor-cores) in Volta, and following with both the Turing and Ampere architectures, significant training speedups are experienced by switching to mixed precision -- up to 3x overall speedup on the most arithmetically intense model architectures. Using mixed precision training requires two steps: + +1. 
Porting the model to use the FP16 data type where appropriate. +2. Adding loss scaling to preserve small gradient values. + +This can now be achieved using Automatic Mixed Precision (AMP) for TensorFlow to enable the full [mixed precision methodology](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#tensorflow) in your existing TensorFlow model code. AMP enables mixed precision training on Volta, Turing, and NVIDIA Ampere GPU architectures automatically. The TensorFlow framework code makes all necessary model changes internally. + +In TF-AMP, the computational graph is optimized to use as few casts as necessary and maximize the use of FP16, and the loss scaling is automatically applied inside of supported optimizers. AMP can be configured to work with the existing tf.contrib loss scaling manager by disabling the AMP scaling with a single environment variable to perform only the automatic mixed-precision optimization. It accomplishes this by automatically rewriting all computation graphs with the necessary operations to enable mixed precision training and automatic loss scaling. + +For information about: +- How to train using mixed precision, see the [Mixed Precision Training](https://arxiv.org/abs/1710.03740) paper and [Training With Mixed Precision](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html) documentation. +- Techniques used for mixed precision training, see the [Mixed-Precision Training of Deep Neural Networks](https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/) blog. +- How to access and enable AMP for TensorFlow, see [Using TF-AMP](https://docs.nvidia.com/deeplearning/dgx/tensorflow-user-guide/index.html#tfamp) from the TensorFlow User Guide. + + +#### Enabling mixed precision + +This implementation exploits the TensorFlow Automatic Mixed Precision feature. To enable AMP, you simply need to supply the `--amp` flag to the `run_pretraining.py` or `run_tf_squad.py` script. For reference, enabling AMP required us to apply the following changes to the code: + +1. Set the Keras mixed precision policy: + ```python + if config.amp: + policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16", loss_scale="dynamic") + tf.keras.mixed_precision.experimental.set_policy(policy) + ``` + +2. Use the loss scaling wrapper on the optimizer: + ```python + if config.amp: + optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic") + ``` + +3. Use scaled loss to calculate the gradients: + ```python + #Scale loss + if config.amp: + total_loss = optimizer.get_scaled_loss(total_loss) + gradients = tape.gradient(total_loss, model.trainable_variables) + #Get unscaled gradients if AMP + if config.amp: + gradients = optimizer.get_unscaled_gradients(gradients) + ``` + +#### Enabling TF32 + +TensorFloat-32 (TF32) is the new math mode in [NVIDIA A100](https://www.nvidia.com/en-us/data-center/a100/) GPUs for handling the matrix math also called tensor operations. TF32 running on Tensor Cores in A100 GPUs can provide up to 10x speedups compared to single-precision floating-point math (FP32) on Volta GPUs. + +TF32 Tensor Cores can speed up networks using FP32, typically with no loss of accuracy. It is more robust than FP16 for models which require high dynamic range for weights or activations. 
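When validating numerics, it can be helpful to compare against a run with TF32 turned off. The snippet below is a minimal sketch using TensorFlow's public toggle (available in TensorFlow 2.4 and later); it is not wired into the scripts in this repository and should be called before the model executes:

```python
import tensorflow as tf

# Minimal sketch: disable TF32 to obtain a pure-FP32 baseline for numerical comparison.
tf.config.experimental.enable_tensor_float_32_execution(False)
print("TF32 enabled:", tf.config.experimental.tensor_float_32_execution_enabled())

# Restore the default (TF32 on) for Ampere GPUs once the comparison is done.
tf.config.experimental.enable_tensor_float_32_execution(True)
```

This toggle only affects float32 matrix multiplications and convolutions on Ampere-class GPUs; FP16/AMP execution paths are unchanged.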
+ +For more information, refer to the [TensorFloat-32 in the A100 GPU Accelerates AI Training, HPC up to 20x](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) blog post. + +TF32 is supported in the NVIDIA Ampere GPU architecture and is enabled by default. + +### Glossary + +**Fine-tuning** +Training an already pretrained model further using a task specific dataset for subject-specific refinements, by adding task-specific layers on top if required. + +**Language Model** +Assigns a probability distribution over a sequence of words. Given a sequence of words, it assigns a probability to the whole sequence. + +**Pre-training** +Training a model on vast amounts of data on the same (or different) task to build general understandings. + +**Transformer** +The paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762) introduces a novel architecture called Transformer that uses an attention mechanism and transforms one sequence into another. + + **Phase 1** +Pretraining on samples of sequence length 128 and at most 15% masked predictions per sequence. + +**Phase 2** +Pretraining on samples of sequence length 512 and at most 15% masked predictions per sequence. + +## Setup + +The following section lists the requirements that you need to meet in order to start training the ELECTRA model. + +### Requirements + +This repository contains Dockerfile which extends the TensorFlow2 NGC container and encapsulates some dependencies. Aside from these dependencies, ensure you have the following components: + +- [NVIDIA Docker](https://github.com/NVIDIA/nvidia-docker) +- [TensorFlow2 20.07-py3 NGC container or later](https://ngc.nvidia.com/registry/nvidia-tensorflow) +- Supported GPUs: + - [NVIDIA Volta architecture](https://www.nvidia.com/en-us/data-center/volta-gpu-architecture/) + - [NVIDIA Turing architecture](https://www.nvidia.com/en-us/geforce/turing/) + - [NVIDIA Ampere architecture](https://www.nvidia.com/en-us/data-center/nvidia-ampere-gpu-architecture/) + +For more information about how to get started with NGC containers, see the following sections from the NVIDIA GPU Cloud Documentation and the Deep Learning Documentation: +- [Getting Started Using NVIDIA GPU Cloud](https://docs.nvidia.com/ngc/ngc-getting-started-guide/index.html) +- [Accessing And Pulling From The NGC Container Registry](https://docs.nvidia.com/deeplearning/dgx/user-guide/index.html#accessing_registry) +- [Running TensorFlow2](https://docs.nvidia.com/deeplearning/frameworks/tensorflow-release-notes/running.html#running) + +For those unable to use the TensorFlow 2 NGC container, to set up the required environment or create your own container, see the versioned [NVIDIA Container Support Matrix](https://docs.nvidia.com/deeplearning/dgx/support-matrix/index.html). + +For multi-node, the sample provided in this repository requires [Enroot](https://github.com/NVIDIA/enroot) and [Pyxis](https://github.com/NVIDIA/pyxis) set up on a [SLURM](https://slurm.schedmd.com) cluster. + +More information on how to set up and launch can be found in the [Multi-node Documentation](https://docs.nvidia.com/ngc/multi-node-bert-user-guide). + +## Quick Start Guide + +To train your model using mixed precision or TF32 precision with Tensor Cores or using FP32, perform the following steps using the default parameters of the ELECTRA model. The default parameters for pre-training have been set to run on both 8x A100 40G and 8 x V100 32G GPUs. 
For the specifics concerning training and inference, see the [Advanced](#advanced) section. + +1. Clone the repository. + +``` +git clone https://github.com/NVIDIA/DeepLearningExamples.git +cd DeepLearningExamples/TensorFlow2/LanguageModeling/ELECTRA +``` + +2. Build ELECTRA on top of the NGC container. +``` +bash scripts/docker/build.sh +``` + +3. Start an interactive session in the NGC container to run data download, training and inference. +``` +bash scripts/docker/launch.sh +``` + +Resultant logs of pre-training and fine-tuning routines are stored in the `results/` folder. Checkpoints are stored in the `results//` folder. + +Required data is downloaded into the `data/` directory by default. + +4. Download and preprocess the dataset. + +This repository provides scripts to download, verify, and extract the following datasets: + +- [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) (fine-tuning for question answering) +- Wikipedia (pre-training) +- BookCorpus (pre-training) + +To download, verify, extract the datasets, and create the shards in `tfrecord` format, run: +``` +/workspace/electra/data/create_datasets_from_start.sh +``` + +Note: For fine-tuning only, Wikipedia and Bookscorpus dataset download and preprocessing can be skipped by commenting it out. + +- Download Wikipedia only for pretraining + +The pre-training dataset is 170GB+ and takes 15+ hours to download. The BookCorpus server most of the time gets overloaded and also contains broken links resulting in HTTP 403 and 503 errors. Hence, it is recommended to skip downloading BookCorpus data by running: + +``` +/workspace/electra/data/create_datasets_from_start.sh wiki_only +``` + +- Download Wikipedia and BookCorpus + +Users are welcome to download the BookCorpus from other sources to match our accuracy, or repeatedly try our script until the required number of files are downloaded by running the following: +``` +/workspace/electra/data/create_datasets_from_start.sh wiki_books +``` + +Note: Not using the BookCorpus can potentially change the final accuracy on a few downstream tasks. + +5. Start pretraining. + +To run on a single node 8 x V100 32G, from within the container, you can use the following script to run pre-training. +``` +bash scripts/run_pretraining.sh +``` + +The default hyperparameters are set to run on both 8 x A100 40G and 8 x V100 32G. + +For the other platforms, the configs present in `scripts/configs/pretrain_config.sh` can be used as shown below: +``` +bash scripts/run_pretraining.sh $(source scripts/configs/pretrain_config.sh && dgxa100_8gpu_amp) +``` + +To run pre-training on multiple nodes, see the [Multi-node](#multi-node) section. + +6. Postprocess pretrained checkpoint and fine-tune on SQuAD dataset + +The above pretrained ELECTRA model representations can be fine-tuned with just one additional output layer for a state-of-the-art question answering system. Running the following script extracts and saves the discriminator and generator from the pretrained checkpoint and fine-tunes the discriminator on SQuAD: + +``` +checkpoints=results/base/checkpoints bash scripts/finetune_ckpts_on_squad.sh +``` + +It internally runs `postprocess_pretrained_ckpt.py` which extracts and saves the discriminator and the generator from the pretrained checkpoint. + +The default hyperparameters are set to run on 8 x V100 16G. + +To run fine-tuning with the SQuAD dataset on Google's pretrained checkpoints, do the following. 
+``` +bash scripts/run_squad.sh +``` + +For other platforms, configs present in `scripts/configs/squad_config.sh` can be used as shown below: +``` +bash scripts/run_squad.sh $(source scripts/configs/squad_config.sh && dgxa100_8gpu_amp) train_eval +``` + +7. Start validation/evaluation. + +Validation can be performed by running: +``` +bash scripts/run_squad.sh $(source scripts/configs/squad_config.sh && dgxa100_8gpu_amp) eval +``` +Running training first is required to generate needed checkpoints. + +8. Start inference/predictions. + +Inference can be performed by running: +``` +bash scripts/run_squad.sh $(source scripts/configs/squad_config.sh && dgxa100_8gpu_amp) prediction +``` +Inference predictions are saved to `/predictions.json`. + +## Advanced + +The following sections provide greater details of the datasets, running training and inference, and the training results. + +### Scripts and sample code + +Descriptions of the key scripts and folders are provided below. + +- `data/` - Contains scripts for downloading and preparing individual datasets, and will contain downloaded and processed datasets. +- `scripts/` - Contains shell scripts to launch the Docker container, data download, pre-training, fine-tuning and inference. +- `results/` - Folder where all training and inference results get stored by default. +- `run_squad.sh` - Interface for launching question answering fine-tuning with `run_tf_squad.py`. +- `run_pretraining.sh` - Interface for launching ELECTRA pre-training with `run_pretraining.py`. +- `finetune_ckpts_on_squad.sh` - Interface for extracting and saving discriminator and generator from the pretrained checkpoint and run SQuAD fine-tuning on discriminator. +- `build_pretraining_dataset.py` - Creates `tfrecord` files from shared text files in the final step of dataset creation. +- `postprocess_pretrained_ckpt.py` - Converts pretrained checkpoint to discriminator checkpoint and generator checkpoint which can be fed into `run_tf_squad.py`. +- `modeling.py` - Implements the ELECTRA pre-training and fine-tuning model architectures with TensorFlow2. +- `optimization.py` - Implements the Adam optimizer, LAMB and the learning rate schedule with TensorFlow2. +- `configuration.py` - Implements parent class for model config. +- `tokenization.py` - Implements the ELECTRA tokenizer. +- `run_pretraining.py` - Implements ELECTRA pre-training. +- `pretrain_utils.py` - Utilities required for pre-training such as dynamic masking etc., +- `run_tf_squad.py` - Implements fine-tuning training and evaluation for question answering on the [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) dataset. +- `inference.py` - Implements interactive question answering. +- `postprocess_pretrained_ckpt.py` - Implements extracting and saving the discriminator and the generator from the pretrained checkpoint. + + +### Parameters + +#### Pre-training parameters + +ELECTRA is designed to pre-train deep bidirectional networks for language representations. The following scripts replicate pre-training on Wikipedia + BookCorpus from this [paper](https://openreview.net/forum?id=r1xMH1BtvB). These scripts are general and can be used for pre-training language representations on any corpus of choice. + +In the parameters expected by `scripts/run_pretraining.sh`, `p1` stands for phase 1 whereas `p2` stands for phase 2 training. They are as follows: + +- `` is per-GPU batch size used for training. Larger batch sizes run more efficiently, but require more GPU memory. Default is 176. 
+- `` is the base learning rate for training. Default is 6e-3. +- `` is the type of math in your model, can be either `fp32` or `amp`. Default is `amp`. The options mean: + - FP32: 32-bit IEEE single precision float format. + - AMP: Automatic mixed precision 16 and 32-bit float format. +- `` is the number of GPUs to use for training. Must be equal to or smaller than the number of GPUs attached to your node. Default is 8. +- `` is the percentage of training steps used for warm-up at the start of training. Default is 2000. +- `` is the total number of training steps. Default is 10000. +- `` controls how often checkpoints are saved. Default is 500. +- `` if set to `true`, training should resume from the latest model in `/results/checkpoints`. Default is `false`. +- `` a flag indicating whether a larger batch should be simulated with gradient accumulation. Default is `true`. +- `` an integer indicating the number of steps to accumulate gradients over. Effective batch size / GPU = `training_batch_size` x `gradient_accumulation_steps`. Default is 48. +- `` random seed for the run. + +- `` is per-GPU batch size used for training in phase 2. Larger batch sizes run more efficiently, but require more memory. Default is 24. +- `` is the base learning rate for training phase 2. Default is 4e-3. +- `` is the percentage of training steps used for warm-up at the start of training. Default is 200. +- `` is the total number of training steps for phase 2, to be continued in addition to phase 1. Default is 930. +- `` an integer indicating the number of steps to accumulate gradients over in phase 2. Effective batch size / GPU = `training_batch_size_p2` * `gradient_accumulation_steps_p2`. Default is 144. +- `` A checkpoint to start the pre-training routine on (Usually a ELECTRA pretrained checkpoint). Default is `None`. + + +The complete list of the available parameters for the `run_pretraining.py` script are: + +``` + --model_name MODEL_NAME + - Model name, used to define the name of the results folder. + + --pretrain_tfrecords PRETRAIN_TFRECORDS + - Specifies tfrecord files used for pretraining. + + --max_seq_length MAX_SEQ_LENGTH + - The maximum total input sequence length after + WordPiece tokenization. Sequences longer than + this will be truncated, and sequences shorter + than this will be padded. + + --mask_prob MASK_PROB - Percentage of input tokens to mask out / replace. + + --disc_weight DISC_WEIGHT + - Ratio of discriminator loss over generator loss. + + --generator_hidden_size GENERATOR_HIDDEN_SIZE + - Fraction of discriminator hidden size for generator. + + --train_batch_size TRAIN_BATCH_SIZE + - Batch size per GPU for training. + + --learning_rate LEARNING_RATE + - The initial learning rate for the optimizer. + + --num_train_steps NUM_TRAIN_STEPS + - Total number of training steps to perform. + + --num_warmup_steps NUM_WARMUP_STEPS + - Number of steps of training to perform linear learning + rate warmup for. For example, 0.1 = 10% of training. + + --seed SEED - Sets the seed to use for random number generation. + + --gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS + - Number of update steps to accumulate before + performing a backward/update pass. + + --fp16_compression - Whether to use 16-bit all reduce + + --amp - If set, will perform computations using + automatic mixed precision. + + --log_freq LOG_FREQ - If set, the script will output the training + loss every LOG_FREQ steps. + + --save_checkpoints_steps SAVE_CHECKPOINTS_STEPS + - Checkpoints saving frequency. 
+ + --keep_checkpoint_max KEEP_CHECKPOINT_MAX + - Maximum number of checkpoints to keep. + + --restore_checkpoint RESTORE_CHECKPOINT + - Whether to restore from a checkpoint; if specified, + set to `path-to-checkpoint` or `latest` + + --phase2 - Specified if training on phase 2 only. + If not specified, default pre-training is on phase 1. + + --optimizer OPTIMIZER - Specifies optimizer, `adam` or `lamb`. + + --skip_adaptive - Whether to apply adaptive learning rate on LayerNorm and biases. + + --gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS + - Number of steps to accumulate gradients across before + performing an update. + + --lr_decay_power LR_DECAY_POWER + - Learning rate polynomial decay power. + + --opt_beta_1 OPT_BETA_1 - beta2 of optimizer. + + --opt_beta_2 OPT_BETA_2 - beta2 of optimizer. + + --end_lr END_LR - Ending learning rate. + +``` + +#### Fine-tuning parameters + +Default arguments are listed below in the order `scripts/run_squad.sh` expects: + +- ELECTRA MODEL - The default is `"google/electra-base-discriminator"`. +- Number of training Epochs - The default is `2`. +- Batch size - The default is `16`. +- Learning rate - The default is `4e-4`. +- Precision (either `amp`, `tf32` or `fp32`) - The default is `amp`. +- Number of GPUs - The default is `8`. +- Seed - The default is `1`. +- SQuAD version - The default is `1.1` +- SQuAD directory - The default is `/workspace/electra/data/download/squad/v$SQUAD_VERSION`. +- Output directory for result - The default is `results/`. +- Initialize checkpoint - The default is `"None"` +- Mode (`train`, `eval`, `train_eval`, `prediction`) - The default is `train_eval`. + +The script saves the checkpoint at the end of each epoch to the `checkpoints/` folder. + +The main script `run_tf_squad.py` specific parameters are: + +``` + --electra_model ELECTRA_MODEL - Specifies the type of ELECTRA model to use; + should be the discriminator of a pretrained checkpoint(output of postprocess_pretrained_ckpt.py) + or one of the following: + google/electra-small-generator + google/electra-base-generator + google/electra-large-generator + google/electra-small-discriminator + google/electra-base-discriminator + google/electra-large-discriminator + + --amp - If set, will perform computations using + automatic mixed precision. + + --data_dir DATA_DIR - Path to the SQuAD json for training and evaluation. + + --max_seq_length MAX_SEQ_LENGTH + - The maximum total input sequence length + after WordPiece tokenization. + Sequences longer than this will be truncated, + and sequences shorter than this will be padded. + + --doc_stride DOC_STRIDE - When splitting up a long document into chunks + this parameters sets how much stride to take + between chunks of tokens. + + --max_query_length MAX_QUERY_LENGTH + - The maximum number of tokens for the question. + Questions longer than + will be truncated to the value specified. + + --n_best_size N_BEST_SIZE - The total number of n-best predictions to + generate in the nbest_predictions.json + output file. + + --max_answer_length MAX_ANSWER_LENGTH + - The maximum length of an answer that can be + generated. This is needed because the start and + end predictions are not conditioned on one another. + + --joint_head - If true, beam search will be used to jointly predict + the start and end positions. Default is True. + + --beam_size BEAM_SIZE - The beam size used to do joint predictions. The default value is 5. + + --verbose_logging - If true, all the warnings related to data + processing will be printed. 
A number of warnings + are expected for a normal SQuAD evaluation. + + --do_lower_case - Whether to lower case the input text. Set to + true for uncased models and false for cased models. + + --version_2_with_negative - If true, the SQuAD examples contain questions + that do not have an answer. + + --null_score_diff_threshold NULL_SCORE_DIFF_THRES HOLD + - A null answer will be predicted if null_score + is greater than NULL_SCORE_DIFF_THRESHOLD. +``` + +### Command-line options + +To see the full list of available options and their descriptions, use the `-h` or `--help` command line option, for example: + +`python run_pretraining.py --help` + +`python run_tf_squad.py --help` + +Detailed descriptions of command-line options can be found in the [Parameters](#parameters) section. + +### Getting the data + +For pre-training ELECTRA, we use the concatenation of Wikipedia (2500M words) as well as BookCorpus (800M words). For Wikipedia, we extract only the text passages and ignore headers, lists, and tables. ELECTRA requires that datasets are structured as a document level corpus rather than a shuffled sentence level corpus because it is critical to extract long contiguous sentences. + +The preparation of the pre-training dataset is described in the `dataPrep.py` script found in the `data/` folder. The component steps in the automated scripts to prepare the datasets are as follows: + +1. Data download and extract - the dataset is downloaded and extracted. + +2. Clean and format - document tags, etc. are removed from the dataset. + +3. Sentence segmentation - the corpus text file is processed into separate sentences. + +4. Sharding - the sentence segmented corpus file is split into a number of uniformly distributed smaller text documents. + +5. `tfrecord` file creation - each text file shard is processed by the `build_pretraining_dataset.py` script to produce a corresponding `tfrecord` file. The script generates input data for the input text shard. + +The tools used for preparing the BookCorpus and Wikipedia datasets can be applied to prepare an arbitrary corpus. The `create_datasets_from_start.sh` script in the `data/` directory applies sentence segmentation, sharding, and `tfrecord` file creation given an arbitrary text file containing a document-separated text corpus. + +For fine-tuning a pre-trained ELECTRA model for specific tasks, by default this repository prepares the following dataset: + +- [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/): for question answering + +Depending on the speed of your internet connection, this process takes about a day to complete. The BookCorpus server could sometimes get overloaded and also contain broken links resulting in HTTP 403 and 503 errors. You can either skip the missing files or retry downloading at a later time. + + +#### Multi-dataset + +This repository provides functionality to combine multiple datasets into a single dataset for pre-training on a diverse text corpus at the shard level. Currently Wikipedia and BookCorpus get merged in `data/create_datasets_from_start.sh`. Snippets to download and format more text corpuses can be added to `data/dataPrep.py`. The sharding scheme combines multiple corpuses together and splits them into the required number of training(90%) and testing(10%) shards. Once the data is sharded, the `build_pretraining_dataset.py` converts raw text shards to tokenized segments and saves the dataset to the `data` directory in TFRecord format. This dataset can now be used to pre-train ELECTRA. 
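Before launching pre-training on a newly assembled corpus, it is worth sanity-checking the generated shards. The snippet below is a minimal sketch and is not part of the repository's scripts; the shard pattern is an assumption and should be adjusted to wherever `create_datasets_from_start.sh` wrote its output:

```python
import tensorflow as tf

# Assumed shard location and suffix; adjust to your actual output directory.
shard_pattern = "data/tfrecords/train/*.tfrecord*"
shards = tf.io.gfile.glob(shard_pattern)
print(f"Found {len(shards)} training shards")

# Counting every record is slow for a full corpus; a couple of shards
# are enough for a quick sanity check.
sample = shards[:2]
num_records = sum(1 for _ in tf.data.TFRecordDataset(sample))
print(f"{num_records} serialized examples in the first {len(sample)} shard(s)")
```

The shard counts should roughly reflect the 90%/10% training/testing split described above.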
+
+
+### Training process
+
+The training process consists of two steps: pre-training and fine-tuning.
+
+#### Pre-training
+
+Pre-training is performed using `run_pretraining.py` along with parameters defined in `scripts/run_pretraining.sh` and `scripts/configs/pretrain_config.sh`.
+
+The `run_pretraining.sh` script runs a job on a single node that trains the ELECTRA-base model from scratch using the Wikipedia and BookCorpus datasets as training data with the LAMB optimizer.
+
+Phase 1: (Maximum sequence length of 128)
+- Runs on 8 GPUs with a training batch size of 176 per GPU
+- Uses a learning rate of 6e-3
+- Has FP16 precision enabled
+- Runs for 10000 steps, where the first 2000 are warm-up steps
+- Saves a checkpoint every 500 iterations (keeps only the latest 5 checkpoints) and at the end of training. All checkpoints and training logs are saved to the `results/` directory.
+- Creates a log file containing all the output
+
+Phase 2: (Maximum sequence length of 512)
+- Runs on 8 GPUs with a training batch size of 24 per GPU
+- Uses a learning rate of 4e-3
+- Has FP16 precision enabled
+- Runs for 930 steps, where the first 200 are warm-up steps
+- Saves a checkpoint every 500 iterations (keeps only the latest 5 checkpoints) and at the end of training. All checkpoints and training logs are saved to the `results/` directory.
+- Creates a log file containing all the output
+
+Specific configs available in `scripts/configs/pretrain_config.sh` can be run as follows:
+```
+bash scripts/run_pretraining.sh $(source scripts/configs/pretrain_config.sh && dgxa100_8gpu_amp)
+bash scripts/run_pretraining.sh $(source scripts/configs/pretrain_config.sh && dgx2_16gpu_amp)
+bash scripts/run_pretraining.sh $(source scripts/configs/pretrain_config.sh && dgx1_8gpu_amp)
+```
+
+The above commands will train ELECTRA-base on Wikipedia and BookCorpus to state-of-the-art accuracy on any DGX platform using FP16 arithmetic. Around 96% of the training sequences are of length 128 (phase 1 of training) and less than 4% of the training sequences are of length 512 (phase 2 of training).
+
+To run the pre-training routine from an initial checkpoint, do the following in `scripts/run_pretraining.sh` (a sketch is shown after the multi-node example below):
+- set `restore_checkpoint=` to the path of the checkpoint to resume from, or to `latest`
+- Note: The parameter value assigned to `--model_size` during training should remain unchanged. Also, to resume pre-training on your corpus of choice, the training dataset should be created using the same vocabulary file used in `data/create_datasets_from_start.sh`.
+
+
+#### Multi-node
+
+Multi-node runs can be launched on a Pyxis/enroot Slurm cluster (see [Requirements](#requirements)) with the `run.sub` script; the following commands show a 48-node NVIDIA DGX A100 example for phase 1 and phase 2:
+
+```
+BATCHSIZE=176 LR=6e-3 GRAD_ACCUM_STEPS=1 PHASE=1 STEPS=10000 WARMUP=2000 b1=0.878 b2=0.974 decay=0.5 skip_adaptive=yes end_lr=0.0 sbatch -N48 --ntasks-per-node=8 run.sub
+BATCHSIZE=24 LR=4e-3 GRAD_ACCUM_STEPS=3 PHASE=2 STEPS=930 WARMUP=200 b1=0.878 b2=0.974 decay=0.5 skip_adaptive=yes end_lr=0.0 sbatch -N48 --ntasks-per-node=8 run.sub
+```
+
+The checkpoint after phase 1 will be saved in `/models/`. The checkpoint will be automatically picked up to resume training on phase 2. Note that phase 2 should be run after phase 1.
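+For example (a sketch only; the checkpoint path below is a hypothetical placeholder), resuming single-node pre-training from a saved checkpoint as described above amounts to pointing `restore_checkpoint` in `scripts/run_pretraining.sh` at the checkpoint and relaunching with the same config:
+
+```
+# Inside scripts/run_pretraining.sh (sketch; hypothetical path -- or set it to "latest"):
+restore_checkpoint=results/base/checkpoints
+
+# Relaunch with the same --model_size and your usual config, e.g.:
+bash scripts/run_pretraining.sh $(source scripts/configs/pretrain_config.sh && dgxa100_8gpu_amp)
+```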
+
+For the multi-node commands above, the batch variables `BATCHSIZE`, `LR`, `GRAD_ACCUM_STEPS`, `PHASE`, `STEPS`, `WARMUP`, `b1`, `b2`, `decay`, `skip_adaptive` and `end_lr` refer to the Python arguments `train_batch_size`, `learning_rate`, `gradient_accumulation_steps`, `phase2`, `num_train_steps`, `num_warmup_steps`, `opt_beta_1`, `opt_beta_2`, `lr_decay_power`, `skip_adaptive` and `end_lr` in `run_pretraining.py`, respectively.
+
+Note that the `run.sub` script is a starting point that has to be adapted depending on the environment. In particular, variables such as `docker_image` and `datadir` handle the location of the files for each phase.
+
+Refer to the file's contents to see the full list of variables to adjust for your system.
+
+
+#### Fine-tuning
+
+Fine-tuning is provided for a variety of tasks. The following tasks are included with this repository through the following scripts:
+
+- Question Answering (`scripts/run_squad.sh`)
+
+By default, each Python script implements fine-tuning a pre-trained ELECTRA model for a specified number of training epochs as well as evaluation of the fine-tuned model. Each shell script invokes the associated Python script with the following default parameters:
+
+- Uses 8 GPUs
+- Has FP16 precision enabled
+- Has XLA enabled
+- Saves a checkpoint at the end of training to the `checkpoints/` folder
+
+Specific configs available in `scripts/configs/squad_config.sh` can be run as follows:
+```
+bash scripts/run_squad.sh $(source scripts/configs/squad_config.sh && dgxa100_8gpu_amp) train_eval
+bash scripts/run_squad.sh $(source scripts/configs/squad_config.sh && dgx2_16gpu_amp) train_eval
+bash scripts/run_squad.sh $(source scripts/configs/squad_config.sh && dgx1_8gpu_amp) train_eval
+```
+
+The fine-tuning Python scripts implement support for mixed precision and multi-GPU training through [Horovod](https://github.com/horovod/horovod). For a full list of parameters and associated explanations, see the [Parameters](#parameters) section.
+
+All fine-tuning shell scripts have the same positional arguments, outlined below:
+
+```
+bash scripts/run_squad.sh
+```
+
+By default, the mode positional argument is set to `train_eval`. See the [Fine-tuning parameters](#fine-tuning-parameters) section for explanations of each positional argument.
+
+Note: The first positional argument (the path to the checkpoint to load) is required.
+
+Each fine-tuning script assumes that the corresponding dataset files exist in the `data/` directory, or that a separate path is supplied as a command-line input to `run_squad.sh`.
+
+### Inference process
+
+#### Fine-tuning inference
+
+Inference for the fine-tuning tasks is enabled by the same scripts as training:
+
+- Question Answering (`scripts/run_squad.sh`)
+
+The mode positional argument of the shell script is used to run in evaluation mode. The fine-tuned ELECTRA model will be run on the evaluation dataset, and the evaluation loss and accuracy will be displayed.
+
+Each inference shell script expects dataset files to exist in the same locations as the corresponding training scripts. The inference scripts can be run with default settings. Setting the `mode` positional argument to `eval` runs predictions on a given dataset and evaluates them, while `prediction` only generates the predictions.
+
+`bash scripts/run_squad.sh `
+
+To run inference interactively on question-context pairs, use the script `run_inference.py` as follows:
+
+`python run_inference.py --electra_model --init_checkpoint --question="What food does Harry like?"
--context="My name is Harry and I grew up in Canada. I love apples."`
+
+
+## Performance
+
+The performance measurements in this document were conducted at the time of publication and may not reflect the performance achieved from NVIDIA’s latest software release. For the most up-to-date performance measurements, go to [NVIDIA Data Center Deep Learning Product Performance](https://developer.nvidia.com/deep-learning-performance-training-inference).
+
+### Benchmarking
+
+The following section shows how to run benchmarks measuring the model performance in training and inference modes.
+
+#### Training performance benchmark
+
+Training performance benchmarks for both pre-training phases can be obtained by running `scripts/benchmark_pretraining.sh`. Default parameters are set to run a few training steps of the convergence configuration for an NVIDIA DGX A100 system.
+
+To benchmark training performance with other parameters, run:
+```
+bash scripts/benchmark_pretraining.sh
+```
+
+An example call used to generate throughput numbers:
+```
+bash scripts/benchmark_pretraining.sh 88 amp xla 8 true 2 12 4 base
+```
+
+Training performance benchmarks for fine-tuning can be obtained by running `scripts/benchmark_squad.sh`. The required parameters can be passed through the command line as described in [Training process](#training-process). The performance information is printed after 200 training iterations.
+
+To benchmark the training performance on a specific batch size, run:
+```
+bash scripts/benchmark_squad.sh train
+```
+
+An example call used to generate throughput numbers:
+```
+bash scripts/benchmark_squad.sh train 8 16
+```
+
+#### Inference performance benchmark
+
+Inference performance benchmarks for fine-tuning can be obtained by running `scripts/benchmark_squad.sh`. The required parameters can be passed through the command line as described in [Inference process](#inference-process). This script runs one epoch by default on the SQuAD v1.1 dataset and extracts the average performance for the given configuration.
+
+To benchmark the inference performance on a specific batch size, run:
+`bash scripts/benchmark_squad.sh eval `
+
+An example call used to generate throughput numbers:
+`bash scripts/benchmark_squad.sh eval 8 256`
+
+
+### Results
+
+The following sections provide details on how we achieved our performance and accuracy in training and inference. All results are for the ELECTRA-base model on the SQuAD v1.1 dataset with a sequence length of 384, unless otherwise mentioned.
+
+#### Training accuracy results
+
+##### Pre-training loss curves
+![Pretraining Loss Curves](images/total_loss.svg)
+
+Phase 1 is shown by the blue curve and phase 2 by the grey curve. The y-axis shows the total loss and the x-axis shows the total number of training steps.
+ +##### Pre-training loss results + +| DGX System | GPUs | Batch size / GPU (Phase 1 and Phase 2) | Accumulation steps (Phase 1 and Phase 2) | Final Loss - TF32/FP32 | Final Loss - mixed precision | Time to train(hours) - TF32/FP32 | Time to train(hours) - mixed precision | Time to train speedup (TF32/FP32 to mixed precision) +|---|---|---|---|---|---|---|---|--- +|48 x DGX A100 |8 |176 and 24 |1 and 3 |8.686|8.68|1.61 |1.126|1.43 +|24 x DGX-2H |16|176 and 24 |1 and 3 |8.72 |8.67|5.58 |1.74 |3.20 +|1 x DGX A100 |8 |176 and 24 |48 and 144|- |- |54.84 |30.47|1.8 +|1 x DGX-1 16G |8 |88 and 12 |96 and 288|- |- |241.8 |65.1 |3.71 +|1 x DGX-2 32G |16|176 and 24 |24 and 72 |- |- |109.97|29.08|3.78 + +In the above table, FP32 and TF32 runs were made at half the batch per GPU and twice the gradient accumulation steps of a run with mixed precision in order to not run out of memory. + + +The SQuAD fine-tuning scripts by default train on [Google's ELECTRA++ base pretrained checkpoint](https://github.com/google-research/electra#released-models) which uses around 10x training dataset (dataset used by XLNet authors) and greater than 5x training steps compared to the training recipe in `scripts/run_pretraining.sh`. The latter trains and achieves state-of-the-art accuracy on Wikipedia and BookCorpus datasets only. + +##### Fine-tuning accuracy: NVIDIA DGX A100 (8x A100 40GB) + +Our results were obtained by running the `scripts/run_squad.sh` training script in the tensorflow:20.07-tf2-py3 NGC container on NVIDIA DGX A100 (8x A100 40GB) GPUs. + +*ELECTRA BASE++* + +| GPUs | Batch size / GPU | Accuracy / F1 - FP32 | Accuracy / F1 - mixed precision | Time to train - TF32 (sec) | Time to train - mixed precision (sec) | Time to train speedup (FP32 to mixed precision) | +|---------|---------------------|------------------|-----------------------------|--------------------------|---------------------------------|-------------------------------------------------| +| 1 | 32 | 87.19 / 92.85 | 87.19 / 92.84 | 1699 | 749 | 2.27 | +| 8 | 32 | 86.84 / 92.57 | 86.83 / 92.56 | 263 | 201 | 1.30 | + + +##### Fine-tuning accuracy: NVIDIA DGX-1 (8x V100 16GB) + +Our results were obtained by running the `scripts/run_squad.sh` training script in the tensorflow:20.07-tf2-py3 NGC container on NVIDIA DGX-1 with (8x V100 16GB) GPUs. 
+ +*ELECTRA BASE++* + +| GPUs | Batch size / GPU (FP32 : mixed precision) | Accuracy / F1 - FP32 | Accuracy / F1 - mixed precision | Time to train - FP32 (sec) | Time to train - mixed precision (sec) | Time to train speedup (FP32 to mixed precision) | +|---------|---------------------|------------------|-----------------------------|--------------------------|---------------------------------|-------------------------------------------------| +| 1 | 8 : 16 | 87.36 / 92.82 | 87.32 / 92.74 | 5136 | 1378 | 3.73 | +| 8 | 8 : 16 | 87.02 / 92.73 | 87.02 / 92.72 | 730 | 334 | 2.18 | + +*ELECTRA BASE checkpoint Wikipedia and BookCorpus* + +GPUs | SQuAD version| Batch size / GPU (FP32 : mixed precision) | Accuracy / F1 - FP32 | Accuracy / F1 - mixed precision | Time to train - FP32 (sec) | Time to train - mixed precision (sec) | Time to train speedup (FP32 to mixed precision) | +|---------|-----|----------------|------------------|-----------------------------|--------------------------|---------------------------------|-------------------------------------------------| +| 8 | v1.1 | 8 : 16 | 85.00 / 90.94 | 85.04 / 90.96 | 5136 | 1378 | 3.73 | +| 8 | v2.0 | 8 : 16 | 80.517 / 83.36 | 80.523 / 83.43 | 730 | 334 | 2.18 + +##### Fine-tuning accuracy: NVIDIA DGX-2 (16x V100 32GB) + +Our results were obtained by running the `scripts/run_squad.sh` training script in the tensorflow:20.07-tf2-py3 NGC container on NVIDIA DGX-2 (16x V100 32G) GPUs. + +*ELECTRA BASE++* + +| GPUs | Batch size / GPU | Accuracy / F1 - FP32 | Accuracy / F1 - mixed precision | Time to train - FP32 (sec) | Time to train - mixed precision (sec) | Time to train speedup (FP32 to mixed precision) | +|---------|---------------------|------------------|-----------------------------|--------------------------|---------------------------------|-------------------------------------------------| +| 1 | 32 | 87.14 / 92.69 | 86.95 / 92.69 | 4478 | 1162 | 3.85 | +| 16 | 32 | 86.95 / 90.58 | 86.93 / 92.48 | 333 | 229 | 1.45 | + + +##### Training stability test + +###### Pre-training stability test: NVIDIA DGX A100 (8x A100 40GB) + +*ELECTRA BASE Wikipedia and BookCorpus* + +Training stability with 48 x DGX A100, TF32 computations and loss reported after Phase 2: + +| Accuracy Metric | Seed 1 | Seed 2 | Seed 3 | Seed 4 | Seed 5 | Mean | Standard Deviation +|---|---|---|---|---|---|---|--- +|Final Loss| 8.72 | 8.69 | 8.71 | 8.7 | 8.68 | 8.7 | 0.015 + +###### Fine-tuning stability test: NVIDIA DGX-1 (8x V100 16GB) + +*ELECTRA BASE++* + +Training stability with 8 GPUs, FP16 computations, batch size of 16 on SQuAD v1.1: + +| Accuracy Metric | Seed 1 | Seed 2 | Seed 3 | Seed 4 | Seed 5 | Mean | Standard Deviation +|---|---|---|---|---|---|---|--- +|Exact Match %| 86.99 | 86.81 | 86.95 | 87.10 | 87.26 | 87.02 | 0.17 +| f1 % | 92.7 | 92.66 | 92.65 | 92.61 | 92.97 | 92.72 | 0.14 + + Training stability with 8 GPUs, FP16 computations, batch size of 16 on SQuAD v2.0: + +| Accuracy Metric | Seed 1 | Seed 2 | Seed 3 | Seed 4 | Seed 5 | Mean | Standard Deviation +|---|---|---|---|---|---|---|--- +|Exact Match %| 83.00 | 82.84 | 83.11 | 82.70 | 82.94 | 82.91 | 0.15 +| f1 % | 85.63 | 85.48 | 85.69 | 85.31 | 85.57 | 85.54 | 0.15 + +#### Training performance results + +##### Training performance: NVIDIA DGX A100 (8x A100 40GB) + +Our results were obtained by running the `scripts/benchmark_squad.sh` training script in the tensorflow:20.07-tf2-py3 NGC container on NVIDIA DGX A100 (8x A100 40GB) GPUs. 
Performance numbers (in items/images per second) were averaged over an entire training epoch. + +###### Pre-training NVIDIA DGX A100 (8x A100 40GB) + +| GPUs | Batch size / GPU (TF32 and FP16) | Accumulation steps (TF32 and FP16) | Sequence length | Throughput - TF32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision +|------------------|----------------------|----------------------|-------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|---------------------------------------------- +|1 | 88 and 176| 768 and 384 | 128| 533 |955 |1.79|1.00| 1.00 +|8 | 88 and 176| 96 and 48 | 128| 4202|7512|1.79|7.88| 7.87 +|1 | 12 and 24 | 2304 and 1152| 512| 90 |171 |1.90|1.00| 1.00 +|8 | 12 and 24 | 288 and 144 | 512| 716 |1347|1.88|7.96| 7.88 + +###### Fine-tuning NVIDIA DGX A100 (8x A100 40GB) + +| GPUs | Batch size / GPU | Sequence length | Throughput - TF32 (sequences/sec) | Throughput - mixed precision (sequences/sec) | Throughput speedup (TF32 - mixed precision) | Weak scaling - TF32 | Weak scaling - mixed precision | +|------------------|-----------|-----------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|---------------------------------------------- +| 1 | 32 | 384 | 107 | 317 | 2.96 | 1.00 | 1.00 +| 8 | 32 | 384 | 828 | 2221| 2.68 | 7.74 | 7.00 + +##### Training performance: NVIDIA DGX-1 (8x V100 16GB) + +Our results were obtained by running the `scripts/benchmark_squad.sh` training scripts in the tensorflow:20.07-tf2-py3 NGC container on NVIDIA DGX-1 with (8x V100 16GB) GPUs. Performance numbers (in sequences per second) were averaged over an entire training epoch. + +###### Pre-training NVIDIA DGX-1 (8x V100 16GB) + +| GPUs | Batch size / GPU (FP32 and FP16) | Accumulation steps (FP32 and FP16) | Sequence length | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision +|------------------|----------------------|----------------------|-------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|---------------------------------------------- +|1 | 40 and 88| 1689 and 768 | 128| 116 |444 |3.83 |1.00 | 1.00 +|8 | 40 and 88| 211 and 96 | 128| 920 |3475|3.77 |7.93 | 7.83 +|1 | 6 and 12 | 4608 and 2304| 512| 24 |84 |3.50 |1.00 | 1.00 +|8 | 6 and 12 | 576 and 288 | 512| 190 |656 |3.45 |7.92 | 7.81 + +###### Fine-tuning NVIDIA DGX-1 (8x V100 16GB) + +| GPUs | Batch size / GPU (FP32 : mixed precision) | Sequence length | Throughput - FP32 (sequences/sec) | Throughput - mixed precision (sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision | +|------------------|-----------|-----------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|---------------------------------------------- +|1 | 8 : 16| 384| 35| 154| 4.4 | 1.00| 1.00 +|8 | 8 : 16| 384|268|1051| 3.92| 7.66| 6.82 + +To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). 
+ +##### Training performance: NVIDIA DGX-2 (16x V100 32GB) + +Our results were obtained by running the `scripts/benchmark_squad.sh` training scripts in the tensorflow:20.07-tf2-py3 NGC container on NVIDIA DGX-2 with (16x V100 32G) GPUs. Performance numbers (in sequences per second) were averaged over an entire training epoch. + +###### Pre-training NVIDIA DGX-2 (16x V100 32GB) + +| GPUs | Batch size / GPU (FP32 and FP16) | Accumulation steps (FP32 and FP16) | Sequence length | Throughput - FP32(sequences/sec) | Throughput - mixed precision(sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision +|------------------|----------------------|----------------------|-------------------|-----------------------------------------------|------------------------------------|---------------------------------|----------------------|---------------------------------------------- +|1 | 88 and 176| 768 and 384 | 128| 128 |500 |3.91| 1.00 | 1.00 +|8 | 88 and 176| 96 and 48 | 128| 1011|3916|3.87| 7.90 | 7.83 +|16| 88 and 176| 48 and 24 | 128| 2018|7773|3.85|15.77 |15.55 +|1 | 12 and 24 | 2304 and 1152| 512| 27 |96 |3.55| 1.00 | 1.00 +|8 | 12 and 24 | 288 and 144 | 512| 213 |754 |3.54| 7.89 | 7.85 +|16| 12 and 24 | 144 and 72 | 512| 426 |1506|3.54| 15.78|15.69 + +###### Fine-tuning NVIDIA DGX-2 (16x V100 32GB) + +| GPUs | Batch size / GPU | Sequence length | Throughput - FP32 (sequences/sec) | Throughput - mixed precision (sequences/sec) | Throughput speedup (FP32 - mixed precision) | Weak scaling - FP32 | Weak scaling - mixed precision | +|------|-----------|-------|----------------------------------|---------------------------------------------|---------------------------------------------|---------------------|--------------------------------| +| 1 | 16 | 384 | 40 | 184 | 4.6 | 1.00 | 1.00 | +| 8 | 16 | 384 | 311 | 1289 | 4.14 | 7.77 | 7.00 | +| 16 | 16 | 384 | 626 | 2594 | 4.14 | 15.65 | 14.09 | + +To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). + +#### Inference performance results + +##### Inference performance: NVIDIA DGX A100 (1x A100 40GB) + +Our results were obtained by running the `scripts/benchmark_squad.sh` inferencing benchmarking script in the tensorflow:20.07-tf2-py3 NGC container on NVIDIA DGX A100 (1x A100 40GB) GPU. 
+ +###### Fine-tuning inference on NVIDIA DGX A100 (1x A100 40GB) + +FP16 + +| Batch size | Sequence length | Throughput Avg (sequences/sec) | Latency Avg (ms) | Latency 90% (ms) | Latency 95% (ms) | Latency 99% (ms) | +|------------|-----------------|--------------------------------|------------------|------------------|------------------|------------------| +| 1 | 384 | 166 | 6.035 | 5.995 | 6.013 | 6.029 | +| 256 | 384 | 886 | 276.26 | 274.53 | 275.276 | 275.946 | +| 512 | 384 | 886 | 526.5 | 525.014 | 525.788 | 525.788 | + +TF32 + +| Batch size | Sequence length | Throughput Avg (sequences/sec) | Latency Avg (ms) | Latency 90% (ms) | Latency 95% (ms) | Latency 99% (ms) | +|------------|-----------------|--------------------------------|------------------|------------------|------------------|------------------| +| 1 | 384 | 122 | 8.228 | 8.171 | 8.198 | 8.221 | +| 256 | 384 | 342 | 729.293 | 727.990 | 728.505 | 729.027 | +| 512 | 384 | 350 | 1429.314 | 1427.719 | 1428.550 | 1428.550 | + + + +##### Inference performance: NVIDIA T4 + +Our results were obtained by running the `scripts/benchmark_squad.sh` script in the tensorflow:20.07-tf2-py3 NGC container on NVIDIA Tesla T4 (1x T4 16GB) GPU. + +###### Fine-tuning inference on NVIDIA T4 + +FP16 + +| Batch size | Sequence length | Throughput Avg (sequences/sec) | Latency Avg (ms) | Latency 90% (ms) | Latency 95% (ms) | Latency 99% (ms) | +|------------|-----------------|--------------------------------|------------------|------------------|------------------|------------------| +| 1 | 384 | 58 | 17.413 | 17.295 | 17.349 | 17.395 | +| 128 | 384 | 185 | 677.298 | 675.211 | 675.674 | 676.269 | +| 256 | 384 | 169 | 1451.396 | 1445.070 | 1447.654 | 1450.141 | + +To achieve these same results, follow the steps in the [Quick Start Guide](#quick-start-guide). + +## Release notes + +### Changelog + +July 2020 +- Initial release. + +October 2020 +- Data preparation scripts for pre-training. +- Pre-training support. +- Mixed precision support with Keras AMP policy. +- Update beam size in SQuAD fine-tuning from 4 to 5 for higher accuracy. +- T4 inference performance. + +### Known issues + +There are no known issues with this model. diff --git a/modelzoo/ELECTRA/build_pretraining_dataset.py b/modelzoo/ELECTRA/build_pretraining_dataset.py new file mode 100644 index 00000000..e1385cb2 --- /dev/null +++ b/modelzoo/ELECTRA/build_pretraining_dataset.py @@ -0,0 +1,237 @@ +# coding=utf-8 +# Copyright 2020 The Google Research Authors. +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Writes out text data as tfrecords that ELECTRA can be pre-trained on.""" + +import argparse +import multiprocessing +import os +import random +import time +import tensorflow as tf + +import utils +from tokenization import ElectraTokenizer + + + +def create_int_feature(values): + feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return feature + + +class ExampleBuilder(object): + """Given a stream of input text, creates pretraining examples.""" + + def __init__(self, tokenizer, max_length): + self._tokenizer = tokenizer + self._current_sentences = [] + self._current_length = 0 + self._max_length = max_length + self._target_length = max_length + + def add_line(self, line): + """Adds a line of text to the current example being built.""" + line = line.strip().replace("\n", " ") + if (not line) and self._current_length != 0: # empty lines separate docs + return self._create_example() + bert_tokens = self._tokenizer.tokenize(line) + bert_tokids = self._tokenizer.convert_tokens_to_ids(bert_tokens) + self._current_sentences.append(bert_tokids) + self._current_length += len(bert_tokids) + if self._current_length >= self._target_length: + return self._create_example() + return None + + def _create_example(self): + """Creates a pre-training example from the current list of sentences.""" + # small chance to only have one segment as in classification tasks + if random.random() < 0.1: + first_segment_target_length = 100000 + else: + # -3 due to not yet having [CLS]/[SEP] tokens in the input text + first_segment_target_length = (self._target_length - 3) // 2 + + first_segment = [] + second_segment = [] + for sentence in self._current_sentences: + # the sentence goes to the first segment if (1) the first segment is + # empty, (2) the sentence doesn't put the first segment over length or + # (3) 50% of the time when it does put the first segment over length + if (len(first_segment) == 0 or + len(first_segment) + len(sentence) < first_segment_target_length or + (len(second_segment) == 0 and + len(first_segment) < first_segment_target_length and + random.random() < 0.5)): + first_segment += sentence + else: + second_segment += sentence + + # trim to max_length while accounting for not-yet-added [CLS]/[SEP] tokens + first_segment = first_segment[:self._max_length - 2] + second_segment = second_segment[:max(0, self._max_length - + len(first_segment) - 3)] + + # prepare to start building the next example + self._current_sentences = [] + self._current_length = 0 + # small chance for random-length instead of max_length-length example + if random.random() < 0.05: + self._target_length = random.randint(5, self._max_length) + else: + self._target_length = self._max_length + + return self._make_tf_example(first_segment, second_segment) + + def _make_tf_example(self, first_segment, second_segment): + """Converts two "segments" of text into a tf.train.Example.""" + vocab = self._tokenizer.vocab + input_ids = [vocab["[CLS]"]] + first_segment + [vocab["[SEP]"]] + segment_ids = [0] * len(input_ids) + if second_segment: + input_ids += second_segment + [vocab["[SEP]"]] + segment_ids += [1] * (len(second_segment) + 1) + input_mask = [1] * len(input_ids) + input_ids += [0] * (self._max_length - len(input_ids)) + input_mask += [0] * (self._max_length - len(input_mask)) + segment_ids += [0] * (self._max_length - len(segment_ids)) + tf_example = tf.train.Example(features=tf.train.Features(feature={ + "input_ids": create_int_feature(input_ids), + "input_mask": create_int_feature(input_mask), + 
"segment_ids": create_int_feature(segment_ids) + })) + return tf_example + + +class ExampleWriter(object): + """Writes pre-training examples to disk.""" + + def __init__(self, job_id, vocab_file, output_dir, max_seq_length, + num_jobs, blanks_separate_docs, do_lower_case, + num_out_files=1000): + self._blanks_separate_docs = blanks_separate_docs + tokenizer = ElectraTokenizer( + vocab_file=vocab_file, + do_lower_case=do_lower_case) + self._example_builder = ExampleBuilder(tokenizer, max_seq_length) + self._writers = [] + for i in range(num_out_files): + if i % num_jobs == job_id: + output_fname = os.path.join( + output_dir, "pretrain_data.tfrecord-{:}-of-{:}".format( + i, num_out_files)) + self._writers.append(tf.io.TFRecordWriter(output_fname)) + self.n_written = 0 + + def write_examples(self, input_file): + """Writes out examples from the provided input file.""" + with tf.io.gfile.GFile(input_file) as f: + for line in f: + line = line.strip() + if line or self._blanks_separate_docs: + example = self._example_builder.add_line(line) + if example: + self._writers[self.n_written % len(self._writers)].write( + example.SerializeToString()) + self.n_written += 1 + example = self._example_builder.add_line("") + if example: + self._writers[self.n_written % len(self._writers)].write( + example.SerializeToString()) + self.n_written += 1 + + def finish(self): + for writer in self._writers: + writer.close() + + +def write_examples(job_id, args): + """A single process creating and writing out pre-processed examples.""" + + def log(*args): + msg = " ".join(map(str, args)) + print("Job {}:".format(job_id), msg) + + log("Creating example writer") + example_writer = ExampleWriter( + job_id=job_id, + vocab_file=args.vocab_file, + output_dir=args.output_dir, + max_seq_length=args.max_seq_length, + num_jobs=args.num_processes, + blanks_separate_docs=args.blanks_separate_docs, + do_lower_case=args.do_lower_case, + num_out_files=args.num_out_files, + ) + log("Writing tf examples") + fnames = sorted(tf.io.gfile.listdir(args.corpus_dir)) + fnames = [f for (i, f) in enumerate(fnames) + if i % args.num_processes == job_id] + random.shuffle(fnames) + start_time = time.time() + for file_no, fname in enumerate(fnames): + if file_no > 0: + elapsed = time.time() - start_time + log("processed {:}/{:} files ({:.1f}%), ELAPSED: {:}s, ETA: {:}s, " + "{:} examples written".format( + file_no, len(fnames), 100.0 * file_no / len(fnames), int(elapsed), + int((len(fnames) - file_no) / (file_no / elapsed)), + example_writer.n_written)) + example_writer.write_examples(os.path.join(args.corpus_dir, fname)) + example_writer.finish() + log("Done!") + +# python build_pretraining_dataset --corpus-dir +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--corpus-dir", required=True, + help="Location of pre-training text files.") + parser.add_argument("--vocab-file", required=True, + help="Location of vocabulary file.") + parser.add_argument("--output-dir", required=True, + help="Where to write out the tfrecords.") + parser.add_argument("--max-seq-length", default=128, type=int, + help="Number of tokens per example.") + parser.add_argument("--num-processes", default=1, type=int, + help="Parallelize across multiple processes.") + parser.add_argument("--blanks-separate-docs", default=True, type=bool, + help="Whether blank lines indicate document boundaries.") + parser.add_argument("--do-lower-case", dest='do_lower_case', + action='store_true', help="Lower case input text.") + 
parser.add_argument("--no-lower-case", dest='do_lower_case', + action='store_false', help="Don't lower case input text.") + parser.add_argument("--num-out-files", default=1000, type=int, + help="Number of output files.") + parser.add_argument("--seed", default=1314, type=int) + args = parser.parse_args() + + random.seed(args.seed) + + utils.rmkdir(args.output_dir) + if args.num_processes == 1: + write_examples(0, args) + else: + jobs = [] + for i in range(args.num_processes): + job = multiprocessing.Process(target=write_examples, args=(i, args)) + jobs.append(job) + job.start() + for job in jobs: + job.join() + + +if __name__ == "__main__": + main() diff --git a/modelzoo/ELECTRA/configuration.py b/modelzoo/ELECTRA/configuration.py new file mode 100644 index 00000000..df8d5ae7 --- /dev/null +++ b/modelzoo/ELECTRA/configuration.py @@ -0,0 +1,132 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" ELECTRA model configuration """ + + +import logging + +from configuration_utils import PretrainedConfig + + +logger = logging.getLogger(__name__) + +ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "google/electra-small-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-generator/config.json", + "google/electra-base-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-generator/config.json", + "google/electra-large-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-generator/config.json", + "google/electra-small-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-discriminator/config.json", + "google/electra-base-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-discriminator/config.json", + "google/electra-large-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-discriminator/config.json", +} + + +class ElectraConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a :class:`~transformers.ElectraModel`. + It is used to instantiate an ELECTRA model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the ELECTRA `google/electra-small-discriminator `__ + architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. + + + Args: + vocab_size (:obj:`int`, optional, defaults to 30522): + Vocabulary size of the ELECTRA model. Defines the different tokens that + can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.ElectraModel`. 
+ embedding_size (:obj:`int`, optional, defaults to 128): + Dimensionality of the encoder layers and the pooler layer. + hidden_size (:obj:`int`, optional, defaults to 256): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (:obj:`int`, optional, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (:obj:`int`, optional, defaults to 4): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (:obj:`int`, optional, defaults to 1024): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"): + The non-linear activation function (function or string) in the encoder and pooler. + If string, "gelu", "relu", "swish" and "gelu_new" are supported. + hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (:obj:`int`, optional, defaults to 512): + The maximum sequence length that this model might ever be used with. + Typically set this to something large just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (:obj:`int`, optional, defaults to 2): + The vocabulary size of the `token_type_ids` passed into :class:`~transformers.ElectraModel`. + initializer_range (:obj:`float`, optional, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): + The epsilon used by the layer normalization layers. + + Example:: + + from transformers import ElectraModel, ElectraConfig + + # Initializing a ELECTRA electra-base-uncased style configuration + configuration = ElectraConfig() + + # Initializing a model from the electra-base-uncased style configuration + model = ElectraModel(configuration) + + # Accessing the model configuration + configuration = model.config + + Attributes: + pretrained_config_archive_map (Dict[str, str]): + A dictionary containing all the available pre-trained checkpoints. 
+ """ + pretrained_config_archive_map = ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP + model_type = "electra" + + def __init__( + self, + vocab_size=30522, + embedding_size=128, + hidden_size=256, + num_hidden_layers=12, + num_attention_heads=4, + intermediate_size=1024, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.embedding_size = embedding_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps diff --git a/modelzoo/ELECTRA/configuration_utils.py b/modelzoo/ELECTRA/configuration_utils.py new file mode 100644 index 00000000..b90c4025 --- /dev/null +++ b/modelzoo/ELECTRA/configuration_utils.py @@ -0,0 +1,518 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Configuration base class and utilities.""" + + +import copy +import json +import logging +import os +from typing import Dict, Optional, Tuple + +from utils import log +from file_utils import CONFIG_NAME, cached_path, hf_bucket_url, is_remote_url + + +logger = logging.getLogger(__name__) + + +class PretrainedConfig(object): + r""" Base class for all configuration classes. + Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations. + + Note: + A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to initialize a model does **not** load the model weights. + It only affects the model's configuration. + + Class attributes (overridden by derived classes): + - ``pretrained_config_archive_map``: a python ``dict`` with `shortcut names` (string) as keys and `url` (string) of associated pretrained model configurations as values. + - ``model_type``: a string that identifies the model type, that we serialize into the JSON file, and that we use to recreate the correct object in :class:`~transformers.AutoConfig`. + + Args: + finetuning_task (:obj:`string` or :obj:`None`, `optional`, defaults to :obj:`None`): + Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint. 
+ num_labels (:obj:`int`, `optional`, defaults to `2`): + Number of classes to use when the model is a classification model (sequences/tokens) + output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`): + Should the model returns attentions weights. + output_hidden_states (:obj:`string`, `optional`, defaults to :obj:`False`): + Should the model returns all hidden-states. + torchscript (:obj:`bool`, `optional`, defaults to :obj:`False`): + Is the model used with Torchscript (for PyTorch models). + """ + pretrained_config_archive_map = {} # type: Dict[str, str] + model_type = "" # type: str + + def __init__(self, **kwargs): + # Attributes with defaults + self.output_attentions = kwargs.pop("output_attentions", False) + self.output_hidden_states = kwargs.pop("output_hidden_states", False) + self.output_past = kwargs.pop("output_past", True) # Not used by all models + self.torchscript = kwargs.pop("torchscript", False) # Only used by PyTorch models + self.use_bfloat16 = kwargs.pop("use_bfloat16", False) + self.pruned_heads = kwargs.pop("pruned_heads", {}) + + # Is decoder is used in encoder-decoder models to differentiate encoder from decoder + self.is_encoder_decoder = kwargs.pop("is_encoder_decoder", False) + self.is_decoder = kwargs.pop("is_decoder", False) + + # Parameters for sequence generation + self.max_length = kwargs.pop("max_length", 20) + self.min_length = kwargs.pop("min_length", 0) + self.do_sample = kwargs.pop("do_sample", False) + self.early_stopping = kwargs.pop("early_stopping", False) + self.num_beams = kwargs.pop("num_beams", 1) + self.temperature = kwargs.pop("temperature", 1.0) + self.top_k = kwargs.pop("top_k", 50) + self.top_p = kwargs.pop("top_p", 1.0) + self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0) + self.length_penalty = kwargs.pop("length_penalty", 1.0) + self.no_repeat_ngram_size = kwargs.pop("no_repeat_ngram_size", 0) + self.bad_words_ids = kwargs.pop("bad_words_ids", None) + self.num_return_sequences = kwargs.pop("num_return_sequences", 1) + + # Fine-tuning task arguments + self.architectures = kwargs.pop("architectures", None) + self.finetuning_task = kwargs.pop("finetuning_task", None) + self.num_labels = kwargs.pop("num_labels", 2) + self.id2label = kwargs.pop("id2label", {i: "LABEL_{}".format(i) for i in range(self.num_labels)}) + self.id2label = dict((int(key), value) for key, value in self.id2label.items()) + self.label2id = kwargs.pop("label2id", dict(zip(self.id2label.values(), self.id2label.keys()))) + self.label2id = dict((key, int(value)) for key, value in self.label2id.items()) + + # Tokenizer arguments TODO: eventually tokenizer and models should share the same config + self.prefix = kwargs.pop("prefix", None) + self.bos_token_id = kwargs.pop("bos_token_id", None) + self.pad_token_id = kwargs.pop("pad_token_id", None) + self.eos_token_id = kwargs.pop("eos_token_id", None) + self.decoder_start_token_id = kwargs.pop("decoder_start_token_id", None) + + # task specific arguments + self.task_specific_params = kwargs.pop("task_specific_params", None) + + # TPU arguments + self.xla_device = kwargs.pop("xla_device", None) + + # Additional attributes without default values + for key, value in kwargs.items(): + try: + setattr(self, key, value) + except AttributeError as err: + log("Can't set {} with value {} for {}".format(key, value, self)) + raise err + + @property + def num_labels(self): + return self._num_labels + + @num_labels.setter + def num_labels(self, num_labels): + self._num_labels = num_labels + self.id2label = {i: 
"LABEL_{}".format(i) for i in range(self.num_labels)} + self.id2label = dict((int(key), value) for key, value in self.id2label.items()) + self.label2id = dict(zip(self.id2label.values(), self.id2label.keys())) + self.label2id = dict((key, int(value)) for key, value in self.label2id.items()) + + def save_pretrained(self, save_directory): + """ + Save a configuration object to the directory `save_directory`, so that it + can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method. + + Args: + save_directory (:obj:`string`): + Directory where the configuration JSON file will be saved. + """ + assert os.path.isdir( + save_directory + ), "Saving path should be a directory where the model and configuration can be saved" + + # If we save using the predefined names, we can load using `from_pretrained` + output_config_file = os.path.join(save_directory, CONFIG_NAME) + + self.to_json_file(output_config_file) + log("Configuration saved in {}".format(output_config_file)) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs) -> "PretrainedConfig": + r""" + + Instantiate a :class:`~transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration. + + Args: + pretrained_model_name_or_path (:obj:`string`): + either: + - a string with the `shortcut name` of a pre-trained model configuration to load from cache or + download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to + our S3, e.g.: ``dbmdz/bert-base-german-cased``. + - a path to a `directory` containing a configuration file saved using the + :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. + - a path or url to a saved configuration JSON `file`, e.g.: + ``./my_model_directory/configuration.json``. + cache_dir (:obj:`string`, `optional`): + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + kwargs (:obj:`Dict[str, any]`, `optional`): + The values in kwargs of any keys which are configuration attributes will be used to override the loaded + values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is + controlled by the `return_unused_kwargs` keyword parameter. + force_download (:obj:`bool`, `optional`, defaults to :obj:`False`): + Force to (re-)download the model weights and configuration files and override the cached versions if they exist. + resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`): + Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. + proxies (:obj:`Dict`, `optional`): + A dictionary of proxy servers to use by protocol or endpoint, e.g.: + :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.` + The proxies are used on each request. + return_unused_kwargs: (`optional`) bool: + If False, then this function returns just the final configuration object. + If True, then this functions returns a :obj:`Tuple(config, unused_kwargs)` where `unused_kwargs` is a + dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part + of kwargs which has not been used to update `config` and is otherwise ignored. 
+ + Returns: + :class:`PretrainedConfig`: An instance of a configuration object + + Examples:: + + # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a + # derived class: BertConfig + config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. + config = BertConfig.from_pretrained('./test/saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` + config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json') + config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False) + assert config.output_attention == True + config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, + foo=False, return_unused_kwargs=True) + assert config.output_attention == True + assert unused_kwargs == {'foo': False} + + """ + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + return cls.from_dict(config_dict, **kwargs) + + @classmethod + def get_config_dict( + cls, pretrained_model_name_or_path: str, pretrained_config_archive_map: Optional[Dict] = None, **kwargs + ) -> Tuple[Dict, Dict]: + """ + From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used + for instantiating a Config using `from_dict`. + + Parameters: + pretrained_model_name_or_path (:obj:`string`): + The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. + pretrained_config_archive_map: (:obj:`Dict[str, str]`, `optional`) Dict: + A map of `shortcut names` to `url`. By default, will use the current class attribute. + + Returns: + :obj:`Tuple[Dict, Dict]`: The dictionary that will be used to instantiate the configuration object. + + """ + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", False) + + if pretrained_config_archive_map is None: + pretrained_config_archive_map = cls.pretrained_config_archive_map + + if pretrained_model_name_or_path in pretrained_config_archive_map: + config_file = pretrained_config_archive_map[pretrained_model_name_or_path] + elif os.path.isdir(pretrained_model_name_or_path): + config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) + elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): + config_file = pretrained_model_name_or_path + else: + config_file = hf_bucket_url(pretrained_model_name_or_path, postfix=CONFIG_NAME) + + try: + # Load from URL or cache if already cached + resolved_config_file = cached_path( + config_file, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + ) + # Load config dict + if resolved_config_file is None: + raise EnvironmentError + config_dict = cls._dict_from_json_file(resolved_config_file) + + except EnvironmentError: + if pretrained_model_name_or_path in pretrained_config_archive_map: + msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format( + config_file + ) + else: + msg = ( + "Can't load '{}'. 
Make sure that:\n\n" + "- '{}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n" + "- or '{}' is the correct path to a directory containing a '{}' file\n\n".format( + pretrained_model_name_or_path, + pretrained_model_name_or_path, + pretrained_model_name_or_path, + CONFIG_NAME, + ) + ) + raise EnvironmentError(msg) + + except json.JSONDecodeError: + msg = ( + "Couldn't reach server at '{}' to download configuration file or " + "configuration file is not a valid JSON file. " + "Please check network or file content here: {}.".format(config_file, resolved_config_file) + ) + raise EnvironmentError(msg) + + if resolved_config_file == config_file: + log("loading configuration file {}".format(config_file)) + else: + log("loading configuration file {} from cache at {}".format(config_file, resolved_config_file)) + + return config_dict, kwargs + + @classmethod + def from_dict(cls, config_dict: Dict, **kwargs) -> "PretrainedConfig": + """ + Constructs a `Config` from a Python dictionary of parameters. + + Args: + config_dict (:obj:`Dict[str, any]`): + Dictionary that will be used to instantiate the configuration object. Such a dictionary can be retrieved + from a pre-trained checkpoint by leveraging the :func:`~transformers.PretrainedConfig.get_config_dict` + method. + kwargs (:obj:`Dict[str, any]`): + Additional parameters from which to initialize the configuration object. + + Returns: + :class:`PretrainedConfig`: An instance of a configuration object + """ + return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) + + config = cls(**config_dict) + + if hasattr(config, "pruned_heads"): + config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items()) + + # Update config with kwargs if needed + to_remove = [] + for key, value in kwargs.items(): + if hasattr(config, key): + setattr(config, key, value) + to_remove.append(key) + for key in to_remove: + kwargs.pop(key, None) + + # log("Model config {}".format(str(config))) + if return_unused_kwargs: + return config, kwargs + else: + return config + + @classmethod + def from_json_file(cls, json_file: str) -> "PretrainedConfig": + """ + Constructs a `Config` from the path to a json file of parameters. + + Args: + json_file (:obj:`string`): + Path to the JSON file containing the parameters. + + Returns: + :class:`PretrainedConfig`: An instance of a configuration object + + """ + config_dict = cls._dict_from_json_file(json_file) + return cls(**config_dict) + + @classmethod + def _dict_from_json_file(cls, json_file: str): + with open(json_file, "r", encoding="utf-8") as reader: + text = reader.read() + return json.loads(text) + + def __eq__(self, other): + return self.__dict__ == other.__dict__ + + def __repr__(self): + return "{} {}".format(self.__class__.__name__, self.to_json_string()) + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. + + Returns: + :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + if hasattr(self.__class__, "model_type"): + output["model_type"] = self.__class__.model_type + return output + + def to_json_string(self): + """ + Serializes this instance to a JSON string. + + Returns: + :obj:`string`: String containing all the attributes that make up this configuration instance in JSON format. + """ + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path): + """ + Save this instance to a json file. 
+ + Args: + json_file_path (:obj:`string`): + Path to the JSON file in which this configuration instance's parameters will be saved. + """ + with open(json_file_path, "w", encoding="utf-8") as writer: + writer.write(self.to_json_string()) + + def update(self, config_dict: Dict): + """ + Updates attributes of this class + with attributes from `config_dict`. + + Args: + :obj:`Dict[str, any]`: Dictionary of attributes that shall be updated for this class. + """ + for key, value in config_dict.items(): + setattr(self, key, value) + + +BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", + "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", + "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", + "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", + "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", + "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", + "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", + "bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", + "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", + "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", + "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", + "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", + "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", + "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json", + "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json", + "bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json", + "bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json", + "bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json", + "bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json", + "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/config.json", + "bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/config.json", + "bert-base-dutch-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/config.json", +} + + +class 
BertConfig(PretrainedConfig):
+    r"""
+        This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
+        It is used to instantiate a BERT model according to the specified arguments, defining the model
+        architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+        the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
+
+        Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
+        to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
+        for more information.
+
+        Args:
+            vocab_size (:obj:`int`, optional, defaults to 30522):
+                Vocabulary size of the BERT model. Defines the different tokens that
+                can be represented by the `input_ids` passed to the forward method of :class:`~transformers.BertModel`.
+            hidden_size (:obj:`int`, optional, defaults to 768):
+                Dimensionality of the encoder layers and the pooler layer.
+            num_hidden_layers (:obj:`int`, optional, defaults to 12):
+                Number of hidden layers in the Transformer encoder.
+            num_attention_heads (:obj:`int`, optional, defaults to 12):
+                Number of attention heads for each attention layer in the Transformer encoder.
+            intermediate_size (:obj:`int`, optional, defaults to 3072):
+                Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+            hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
+                The non-linear activation function (function or string) in the encoder and pooler.
+                If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+            hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+                The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+            attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+                The dropout ratio for the attention probabilities.
+            max_position_embeddings (:obj:`int`, optional, defaults to 512):
+                The maximum sequence length that this model might ever be used with.
+                Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+            type_vocab_size (:obj:`int`, optional, defaults to 2):
+                The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
+            initializer_range (:obj:`float`, optional, defaults to 0.02):
+                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+            layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+                The epsilon used by the layer normalization layers.
+
+        Example::
+
+            from transformers import BertModel, BertConfig
+
+            # Initializing a BERT bert-base-uncased style configuration
+            configuration = BertConfig()
+
+            # Initializing a model from the bert-base-uncased style configuration
+            model = BertModel(configuration)
+
+            # Accessing the model configuration
+            configuration = model.config
+
+        Attributes:
+            pretrained_config_archive_map (Dict[str, str]):
+                A dictionary containing all the available pre-trained checkpoints.
+ """ + pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP + model_type = "bert" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps \ No newline at end of file diff --git a/modelzoo/ELECTRA/data/BooksDownloader.py b/modelzoo/ELECTRA/data/BooksDownloader.py new file mode 100644 index 00000000..a10ebde0 --- /dev/null +++ b/modelzoo/ELECTRA/data/BooksDownloader.py @@ -0,0 +1,26 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import subprocess + +class BooksDownloader: + def __init__(self, save_path): + self.save_path = save_path + pass + + + def download(self): + bookscorpus_download_command = 'python3 /workspace/bookcorpus/download_files.py --list /workspace/bookcorpus/url_list.jsonl --out' + bookscorpus_download_command += ' ' + self.save_path + '/bookscorpus' + bookscorpus_download_command += ' --trash-bad-count' + bookscorpus_download_process = subprocess.run(bookscorpus_download_command, shell=True, check=True) diff --git a/modelzoo/ELECTRA/data/BookscorpusTextFormatting.py b/modelzoo/ELECTRA/data/BookscorpusTextFormatting.py new file mode 100644 index 00000000..22e48d4b --- /dev/null +++ b/modelzoo/ELECTRA/data/BookscorpusTextFormatting.py @@ -0,0 +1,32 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
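Before the ELECTRA data-preparation utilities, a minimal usage sketch for the configuration classes defined earlier in this patch. The import path below is an assumption made purely for illustration; use whichever module the patch actually places PretrainedConfig and BertConfig in.

# Sketch only: JSON round-trip through the config helpers defined above.
# The module name `configuration` is hypothetical.
from configuration import BertConfig

config = BertConfig(num_hidden_layers=6)       # override one default, keep the rest
config.to_json_file("/tmp/bert_config.json")   # writes to_json_string(): sorted, indented JSON

restored = BertConfig.from_json_file("/tmp/bert_config.json")
print(restored.num_hidden_layers)              # 6

# from_dict() applies extra kwargs that match existing attributes and can
# hand back the ones it did not use.
cfg, unused = BertConfig.from_dict(
    {"hidden_size": 768}, return_unused_kwargs=True, not_a_bert_field=1
)
print(unused)                                  # {'not_a_bert_field': 1}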
+
+import glob
+import os
+
+class BookscorpusTextFormatting:
+    def __init__(self, books_path, output_filename, recursive = False):
+        self.books_path = books_path
+        self.recursive = recursive
+        self.output_filename = output_filename
+
+
+    # This puts one book per line
+    def merge(self):
+        with open(self.output_filename, mode='w', newline='\n') as ofile:
+            for filename in glob.glob(self.books_path + '/' + '*.txt', recursive=True):
+                with open(filename, mode='r', encoding='utf-8-sig', newline='\n') as file:
+                    for line in file:
+                        if line.strip() != '':
+                            ofile.write(line.strip() + ' ')
+                ofile.write("\n\n")
\ No newline at end of file
diff --git a/modelzoo/ELECTRA/data/Downloader.py b/modelzoo/ELECTRA/data/Downloader.py
new file mode 100644
index 00000000..ebbd43d6
--- /dev/null
+++ b/modelzoo/ELECTRA/data/Downloader.py
@@ -0,0 +1,91 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from GooglePretrainedWeightDownloader import GooglePretrainedWeightDownloader
+from NVIDIAPretrainedWeightDownloader import NVIDIAPretrainedWeightDownloader
+from WikiDownloader import WikiDownloader
+from BooksDownloader import BooksDownloader
+from MRPCDownloader import MRPCDownloader
+from SquadDownloader import SquadDownloader
+
+
+class Downloader:
+    def __init__(self, dataset_name, save_path):
+        self.dataset_name = dataset_name
+        self.save_path = save_path
+
+
+    def download(self):
+        if self.dataset_name == 'bookscorpus':
+            self.download_bookscorpus()
+
+        elif self.dataset_name == 'wikicorpus_en':
+            self.download_wikicorpus('en')
+
+        elif self.dataset_name == 'wikicorpus_zh':
+            self.download_wikicorpus('zh')
+
+        elif self.dataset_name == 'google_pretrained_weights':
+            self.download_google_pretrained_weights()
+
+        elif self.dataset_name == 'nvidia_pretrained_weights':
+            self.download_nvidia_pretrained_weights()
+
+        elif self.dataset_name == 'mrpc':
+            self.download_mrpc()
+
+        elif self.dataset_name == 'squad':
+            self.download_squad()
+
+        elif self.dataset_name == 'all':
+            self.download_bookscorpus()
+            self.download_wikicorpus('en')
+            self.download_wikicorpus('zh')
+            self.download_google_pretrained_weights()
+            self.download_nvidia_pretrained_weights()
+            self.download_mrpc()
+            self.download_squad()
+
+        else:
+            print(self.dataset_name)
+            assert False, 'Unknown dataset_name provided to downloader'
+
+
+    def download_bookscorpus(self):
+        downloader = BooksDownloader(self.save_path)
+        downloader.download()
+
+
+    def download_wikicorpus(self, language):
+        downloader = WikiDownloader(language, self.save_path)
+        downloader.download()
+
+
+    def download_google_pretrained_weights(self):
+        downloader = GooglePretrainedWeightDownloader(self.save_path)
+        downloader.download()
+
+
+    def download_nvidia_pretrained_weights(self):
+        downloader = NVIDIAPretrainedWeightDownloader(self.save_path)
+        downloader.download()
+
+
+    def download_mrpc(self):
+        downloader =
MRPCDownloader(self.save_path) + downloader.download() + + + def download_squad(self): + downloader = SquadDownloader(self.save_path) + downloader.download() diff --git a/modelzoo/ELECTRA/data/GooglePretrainedWeightDownloader.py b/modelzoo/ELECTRA/data/GooglePretrainedWeightDownloader.py new file mode 100644 index 00000000..bb0684d3 --- /dev/null +++ b/modelzoo/ELECTRA/data/GooglePretrainedWeightDownloader.py @@ -0,0 +1,158 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import hashlib +import os +import urllib.request +import zipfile + +class GooglePretrainedWeightDownloader: + def __init__(self, save_path): + self.save_path = save_path + '/google_pretrained_weights' + + if not os.path.exists(self.save_path): + os.makedirs(self.save_path) + + # Download urls + self.model_urls = { + 'bert_base_uncased': ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip', 'uncased_L-12_H-768_A-12.zip'), + 'bert_large_uncased': ('https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip', 'uncased_L-24_H-1024_A-16.zip'), + 'bert_base_cased': ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip', 'cased_L-12_H-768_A-12.zip'), + 'bert_large_cased': ('https://storage.googleapis.com/bert_models/2018_10_18/cased_L-24_H-1024_A-16.zip', 'cased_L-24_H-1024_A-16.zip'), + 'bert_base_multilingual_cased': ('https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip', 'multi_cased_L-12_H-768_A-12.zip'), + 'bert_large_multilingual_uncased': ('https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip', 'multilingual_L-12_H-768_A-12.zip'), + 'bert_base_chinese': ('https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip', 'chinese_L-12_H-768_A-12.zip') + } + + # SHA256sum verification for file download integrity (and checking for changes from the download source over time) + self.bert_base_uncased_sha = { + 'bert_config.json': '7b4e5f53efbd058c67cda0aacfafb340113ea1b5797d9ce6ee411704ba21fcbc', + 'bert_model.ckpt.data-00000-of-00001': '58580dc5e0bf0ae0d2efd51d0e8272b2f808857f0a43a88aaf7549da6d7a8a84', + 'bert_model.ckpt.index': '04c1323086e2f1c5b7c0759d8d3e484afbb0ab45f51793daab9f647113a0117b', + 'bert_model.ckpt.meta': 'dd5682170a10c3ea0280c2e9b9a45fee894eb62da649bbdea37b38b0ded5f60e', + 'vocab.txt': '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3', + } + + self.bert_large_uncased_sha = { + 'bert_config.json': 'bfa42236d269e2aeb3a6d30412a33d15dbe8ea597e2b01dc9518c63cc6efafcb', + 'bert_model.ckpt.data-00000-of-00001': 'bc6b3363e3be458c99ecf64b7f472d2b7c67534fd8f564c0556a678f90f4eea1', + 'bert_model.ckpt.index': '68b52f2205ffc64dc627d1120cf399c1ef1cbc35ea5021d1afc889ffe2ce2093', + 'bert_model.ckpt.meta': '6fcce8ff7628f229a885a593625e3d5ff9687542d5ef128d9beb1b0c05edc4a1', + 'vocab.txt': '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3', + } + + 
self.bert_base_cased_sha = { + 'bert_config.json': 'f11dfb757bea16339a33e1bf327b0aade6e57fd9c29dc6b84f7ddb20682f48bc', + 'bert_model.ckpt.data-00000-of-00001': '734d5a1b68bf98d4e9cb6b6692725d00842a1937af73902e51776905d8f760ea', + 'bert_model.ckpt.index': '517d6ef5c41fc2ca1f595276d6fccf5521810d57f5a74e32616151557790f7b1', + 'bert_model.ckpt.meta': '5f8a9771ff25dadd61582abb4e3a748215a10a6b55947cbb66d0f0ba1694be98', + 'vocab.txt': 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02', + } + + self.bert_large_cased_sha = { + 'bert_config.json': '7adb2125c8225da495656c982fd1c5f64ba8f20ad020838571a3f8a954c2df57', + 'bert_model.ckpt.data-00000-of-00001': '6ff33640f40d472f7a16af0c17b1179ca9dcc0373155fb05335b6a4dd1657ef0', + 'bert_model.ckpt.index': 'ef42a53f577fbe07381f4161b13c7cab4f4fc3b167cec6a9ae382c53d18049cf', + 'bert_model.ckpt.meta': 'd2ddff3ed33b80091eac95171e94149736ea74eb645e575d942ec4a5e01a40a1', + 'vocab.txt': 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02', + } + + self.bert_base_multilingual_cased_sha = { + 'bert_config.json': 'e76c3964bc14a8bb37a5530cdc802699d2f4a6fddfab0611e153aa2528f234f0', + 'bert_model.ckpt.data-00000-of-00001': '55b8a2df41f69c60c5180e50a7c31b7cdf6238909390c4ddf05fbc0d37aa1ac5', + 'bert_model.ckpt.index': '7d8509c2a62b4e300feb55f8e5f1eef41638f4998dd4d887736f42d4f6a34b37', + 'bert_model.ckpt.meta': '95e5f1997e8831f1c31e5cf530f1a2e99f121e9cd20887f2dce6fe9e3343e3fa', + 'vocab.txt': 'fe0fda7c425b48c516fc8f160d594c8022a0808447475c1a7c6d6479763f310c', + } + + self.bert_large_multilingual_uncased_sha = { + 'bert_config.json': '49063bb061390211d2fdd108cada1ed86faa5f90b80c8f6fdddf406afa4c4624', + 'bert_model.ckpt.data-00000-of-00001': '3cd83912ebeb0efe2abf35c9f1d5a515d8e80295e61c49b75c8853f756658429', + 'bert_model.ckpt.index': '87c372c1a3b1dc7effaaa9103c80a81b3cbab04c7933ced224eec3b8ad2cc8e7', + 'bert_model.ckpt.meta': '27f504f34f02acaa6b0f60d65195ec3e3f9505ac14601c6a32b421d0c8413a29', + 'vocab.txt': '87b44292b452f6c05afa49b2e488e7eedf79ea4f4c39db6f2f4b37764228ef3f', + } + + self.bert_base_chinese_sha = { + 'bert_config.json': '7aaad0335058e2640bcb2c2e9a932b1cd9da200c46ea7b8957d54431f201c015', + 'bert_model.ckpt.data-00000-of-00001': '756699356b78ad0ef1ca9ba6528297bcb3dd1aef5feadd31f4775d7c7fc989ba', + 'bert_model.ckpt.index': '46315546e05ce62327b3e2cd1bed22836adcb2ff29735ec87721396edb21b82e', + 'bert_model.ckpt.meta': 'c0f8d51e1ab986604bc2b25d6ec0af7fd21ff94cf67081996ec3f3bf5d823047', + 'vocab.txt': '45bbac6b341c319adc98a532532882e91a9cefc0329aa57bac9ae761c27b291c', + } + + # Relate SHA to urls for loop below + self.model_sha = { + 'bert_base_uncased': self.bert_base_uncased_sha, + 'bert_large_uncased': self.bert_large_uncased_sha, + 'bert_base_cased': self.bert_base_cased_sha, + 'bert_large_cased': self.bert_large_cased_sha, + 'bert_base_multilingual_cased': self.bert_base_multilingual_cased_sha, + 'bert_large_multilingual_uncased': self.bert_large_multilingual_uncased_sha, + 'bert_base_chinese': self.bert_base_chinese_sha + } + + # Helper to get sha256sum of a file + def sha256sum(self, filename): + h = hashlib.sha256() + b = bytearray(128*1024) + mv = memoryview(b) + with open(filename, 'rb', buffering=0) as f: + for n in iter(lambda : f.readinto(mv), 0): + h.update(mv[:n]) + + return h.hexdigest() + + def download(self): + # Iterate over urls: download, unzip, verify sha256sum + found_mismatch_sha = False + for model in self.model_urls: + url = self.model_urls[model][0] + file = self.save_path + '/' + self.model_urls[model][1] + + 
print('Downloading', url) + response = urllib.request.urlopen(url) + with open(file, 'wb') as handle: + handle.write(response.read()) + + print('Unzipping', file) + zip = zipfile.ZipFile(file, 'r') + zip.extractall(self.save_path) + zip.close() + + sha_dict = self.model_sha[model] + for extracted_file in sha_dict: + sha = sha_dict[extracted_file] + if sha != self.sha256sum(file[:-4] + '/' + extracted_file): + found_mismatch_sha = True + print('SHA256sum does not match on file:', extracted_file, 'from download url:', url) + else: + print(file[:-4] + '/' + extracted_file, '\t', 'verified') + + if not found_mismatch_sha: + print("All downloads pass sha256sum verification.") + + def serialize(self): + pass + + def deserialize(self): + pass + + def listAvailableWeights(self): + print("Available Weight Datasets") + for item in self.model_urls: + print(item) + + def listLocallyStoredWeights(self): + pass + diff --git a/modelzoo/ELECTRA/data/MRPCDownloader.py b/modelzoo/ELECTRA/data/MRPCDownloader.py new file mode 100644 index 00000000..42dd4227 --- /dev/null +++ b/modelzoo/ELECTRA/data/MRPCDownloader.py @@ -0,0 +1,44 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import bz2 +import os +import urllib.request +import sys + +class MRPCDownloader: + def __init__(self, save_path): + self.save_path = save_path + '/mrpc' + + if not os.path.exists(self.save_path): + os.makedirs(self.save_path) + + # Documentation - Download link obtained from here: https://github.com/nyu-mll/GLUE-baselines/blob/master/download_glue_data.py + self.download_urls = { + 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc' : 'mrpc_dev_ids.tsv' + } + + def download(self): + for item in self.download_urls: + url = item + file = self.download_urls[item] + + print('Downloading:', url) + if os.path.isfile(self.save_path + '/' + file): + print('** Download file already exists, skipping download') + else: + response = urllib.request.urlopen(url) + with open(self.save_path + '/' + file, "wb") as handle: + handle.write(response.read()) + + diff --git a/modelzoo/ELECTRA/data/NVIDIAPretrainedWeightDownloader.py b/modelzoo/ELECTRA/data/NVIDIAPretrainedWeightDownloader.py new file mode 100644 index 00000000..13c9a320 --- /dev/null +++ b/modelzoo/ELECTRA/data/NVIDIAPretrainedWeightDownloader.py @@ -0,0 +1,27 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +class NVIDIAPretrainedWeightDownloader: + def __init__(self, save_path): + self.save_path = save_path + '/nvidia_pretrained_weights' + + if not os.path.exists(self.save_path): + os.makedirs(self.save_path) + + pass + + + def download(self): + assert False, 'NVIDIAPretrainedWeightDownloader not implemented yet.' \ No newline at end of file diff --git a/modelzoo/ELECTRA/data/SquadDownloader.py b/modelzoo/ELECTRA/data/SquadDownloader.py new file mode 100644 index 00000000..6d64ffc6 --- /dev/null +++ b/modelzoo/ELECTRA/data/SquadDownloader.py @@ -0,0 +1,54 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import bz2 +import os +import urllib.request +import sys + +class SquadDownloader: + def __init__(self, save_path): + self.save_path = save_path + '/squad' + + if not os.path.exists(self.save_path): + os.makedirs(self.save_path) + + if not os.path.exists(self.save_path + '/v1.1'): + os.makedirs(self.save_path + '/v1.1') + + if not os.path.exists(self.save_path + '/v2.0'): + os.makedirs(self.save_path + '/v2.0') + + self.download_urls = { + 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json' : 'v1.1/train-v1.1.json', + 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json' : 'v1.1/dev-v1.1.json', + 'https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/' : 'v1.1/evaluate-v1.1.py', + 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json' : 'v2.0/train-v2.0.json', + 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json' : 'v2.0/dev-v2.0.json', + 'https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/' : 'v2.0/evaluate-v2.0.py', + } + + def download(self): + for item in self.download_urls: + url = item + file = self.download_urls[item] + + print('Downloading:', url) + if os.path.isfile(self.save_path + '/' + file): + print('** Download file already exists, skipping download') + else: + response = urllib.request.urlopen(url) + with open(self.save_path + '/' + file, "wb") as handle: + handle.write(response.read()) + + diff --git a/modelzoo/ELECTRA/data/TextSharding.py b/modelzoo/ELECTRA/data/TextSharding.py new file mode 100644 index 00000000..0753e742 --- /dev/null +++ b/modelzoo/ELECTRA/data/TextSharding.py @@ -0,0 +1,327 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+from itertools import islice
+
+import multiprocessing
+import statistics
+
+class Sharding:
+    def __init__(self, input_files, output_name_prefix, n_training_shards, n_test_shards, fraction_test_set):
+        assert len(input_files) > 0, 'The input file list must contain at least one file.'
+        assert n_training_shards > 0, 'There must be at least one output shard.'
+        assert n_test_shards > 0, 'There must be at least one output shard.'
+
+        self.n_training_shards = n_training_shards
+        self.n_test_shards = n_test_shards
+        self.fraction_test_set = fraction_test_set
+
+        self.input_files = input_files
+
+        self.output_name_prefix = output_name_prefix
+        self.output_training_identifier = '_training'
+        self.output_test_identifier = '_test'
+        self.output_file_extension = '.txt'
+
+        self.articles = {}               # key: integer identifier, value: list of articles
+        self.sentences = {}              # key: integer identifier, value: list of sentences
+        self.output_training_files = {}  # key: filename, value: list of articles to go into file
+        self.output_test_files = {}      # key: filename, value: list of articles to go into file
+
+        self.init_output_files()
+
+
+    # Remember, the input files contain one article per line (the whitespace check is to skip extraneous blank lines)
+    def load_articles(self):
+        print('Start: Loading Articles')
+
+        global_article_count = 0
+        for input_file in self.input_files:
+            print('input file:', input_file)
+            with open(input_file, mode='r', newline='\n') as f:
+                for i, line in enumerate(f):
+                    if line.strip():
+                        self.articles[global_article_count] = line.rstrip()
+                        global_article_count += 1
+
+        print('End: Loading Articles: There are', len(self.articles), 'articles.')
+
+
+    def segment_articles_into_sentences(self, segmenter):
+        print('Start: Sentence Segmentation')
+        if len(self.articles) == 0:
+            self.load_articles()
+
+        assert len(self.articles) != 0, 'Please check that input files are present and contain data.'
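+        # Note: chunks() below defaults to size=len(self.articles), so it yields the whole
+        # article dictionary as a single chunk. Only the 'serial' branch runs because
+        # use_multiprocessing is hard-coded to 'serial'; the 'manager' and 'queue'
+        # multiprocessing paths are work in progress and not selected.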
+ + # TODO: WIP: multiprocessing (create independent ranges and spawn processes) + use_multiprocessing = 'serial' + + def chunks(data, size=len(self.articles)): + it = iter(data) + for i in range(0, len(data), size): + yield {k: data[k] for k in islice(it, size)} + + if use_multiprocessing == 'manager': + manager = multiprocessing.Manager() + return_dict = manager.dict() + jobs = [] + n_processes = 7 # in addition to the main process, total = n_proc+1 + + def work(articles, return_dict): + sentences = {} + for i, article in enumerate(articles): + sentences[i] = segmenter.segment_string(articles[article]) + + if i % 5000 == 0: + print('Segmenting article', i) + + return_dict.update(sentences) + + for item in chunks(self.articles, len(self.articles)): + p = multiprocessing.Process(target=work, args=(item, return_dict)) + + # Busy wait + while len(jobs) >= n_processes: + pass + + jobs.append(p) + p.start() + + for proc in jobs: + proc.join() + + elif use_multiprocessing == 'queue': + work_queue = multiprocessing.Queue() + jobs = [] + + for item in chunks(self.articles, len(self.articles)): + pass + + else: # serial option + for i, article in enumerate(self.articles): + self.sentences[i] = segmenter.segment_string(self.articles[article]) + + if i % 5000 == 0: + print('Segmenting article', i) + + print('End: Sentence Segmentation') + + + def init_output_files(self): + print('Start: Init Output Files') + assert len(self.output_training_files) is 0, 'Internal storage self.output_files already contains data. This function is intended to be used by the constructor only.' + assert len(self.output_test_files) is 0, 'Internal storage self.output_files already contains data. This function is intended to be used by the constructor only.' + + for i in range(self.n_training_shards): + name = self.output_name_prefix + self.output_training_identifier + '_' + str(i) + self.output_file_extension + self.output_training_files[name] = [] + + for i in range(self.n_test_shards): + name = self.output_name_prefix + self.output_test_identifier + '_' + str(i) + self.output_file_extension + self.output_test_files[name] = [] + + print('End: Init Output Files') + + + def get_sentences_per_shard(self, shard): + result = 0 + for article_id in shard: + result += len(self.sentences[article_id]) + + return result + + + def distribute_articles_over_shards(self): + print('Start: Distribute Articles Over Shards') + assert len(self.articles) >= self.n_training_shards + self.n_test_shards, 'There are fewer articles than shards. Please add more data or reduce the number of shards requested.' 
+ + # Create dictionary with - key: sentence count per article, value: article id number + sentence_counts = defaultdict(lambda: []) + + max_sentences = 0 + total_sentences = 0 + + for article_id in self.sentences: + current_length = len(self.sentences[article_id]) + sentence_counts[current_length].append(article_id) + max_sentences = max(max_sentences, current_length) + total_sentences += current_length + + n_sentences_assigned_to_training = int((1 - self.fraction_test_set) * total_sentences) + nominal_sentences_per_training_shard = n_sentences_assigned_to_training // self.n_training_shards + nominal_sentences_per_test_shard = (total_sentences - n_sentences_assigned_to_training) // self.n_test_shards + + consumed_article_set = set({}) + unused_article_set = set(self.articles.keys()) + + # Make first pass and add one article worth of lines per file + for file in self.output_training_files: + current_article_id = sentence_counts[max_sentences][-1] + sentence_counts[max_sentences].pop(-1) + self.output_training_files[file].append(current_article_id) + consumed_article_set.add(current_article_id) + unused_article_set.remove(current_article_id) + + # Maintain the max sentence count + while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0: + max_sentences -= 1 + + if len(self.sentences[current_article_id]) > nominal_sentences_per_training_shard: + nominal_sentences_per_training_shard = len(self.sentences[current_article_id]) + print('Warning: A single article contains more than the nominal number of sentences per training shard.') + + for file in self.output_test_files: + current_article_id = sentence_counts[max_sentences][-1] + sentence_counts[max_sentences].pop(-1) + self.output_test_files[file].append(current_article_id) + consumed_article_set.add(current_article_id) + unused_article_set.remove(current_article_id) + + # Maintain the max sentence count + while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0: + max_sentences -= 1 + + if len(self.sentences[current_article_id]) > nominal_sentences_per_test_shard: + nominal_sentences_per_test_shard = len(self.sentences[current_article_id]) + print('Warning: A single article contains more than the nominal number of sentences per test shard.') + + training_counts = [] + test_counts = [] + + for shard in self.output_training_files: + training_counts.append(self.get_sentences_per_shard(self.output_training_files[shard])) + + for shard in self.output_test_files: + test_counts.append(self.get_sentences_per_shard(self.output_test_files[shard])) + + training_median = statistics.median(training_counts) + test_median = statistics.median(test_counts) + + # Make subsequent passes over files to find articles to add without going over limit + history_remaining = [] + n_history_remaining = 4 + + while len(consumed_article_set) < len(self.articles): + for fidx, file in enumerate(self.output_training_files): + nominal_next_article_size = min(nominal_sentences_per_training_shard - training_counts[fidx], max_sentences) + + # Maintain the max sentence count + while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0: + max_sentences -= 1 + + while len(sentence_counts[nominal_next_article_size]) == 0 and nominal_next_article_size > 0: + nominal_next_article_size -= 1 + + if nominal_next_article_size not in sentence_counts or nominal_next_article_size is 0 or training_counts[fidx] > training_median: + continue # skip adding to this file, will come back later if no file can accept unused articles + + current_article_id = 
sentence_counts[nominal_next_article_size][-1] + sentence_counts[nominal_next_article_size].pop(-1) + + self.output_training_files[file].append(current_article_id) + consumed_article_set.add(current_article_id) + unused_article_set.remove(current_article_id) + + for fidx, file in enumerate(self.output_test_files): + nominal_next_article_size = min(nominal_sentences_per_test_shard - test_counts[fidx], max_sentences) + + # Maintain the max sentence count + while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0: + max_sentences -= 1 + + while len(sentence_counts[nominal_next_article_size]) == 0 and nominal_next_article_size > 0: + nominal_next_article_size -= 1 + + if nominal_next_article_size not in sentence_counts or nominal_next_article_size is 0 or test_counts[fidx] > test_median: + continue # skip adding to this file, will come back later if no file can accept unused articles + + current_article_id = sentence_counts[nominal_next_article_size][-1] + sentence_counts[nominal_next_article_size].pop(-1) + + self.output_test_files[file].append(current_article_id) + consumed_article_set.add(current_article_id) + unused_article_set.remove(current_article_id) + + # If unable to place articles a few times, bump up nominal sizes by fraction until articles get placed + if len(history_remaining) == n_history_remaining: + history_remaining.pop(0) + history_remaining.append(len(unused_article_set)) + + history_same = True + for i in range(1, len(history_remaining)): + history_same = history_same and (history_remaining[i-1] == history_remaining[i]) + + if history_same: + nominal_sentences_per_training_shard += 1 + # nominal_sentences_per_test_shard += 1 + + training_counts = [] + test_counts = [] + for shard in self.output_training_files: + training_counts.append(self.get_sentences_per_shard(self.output_training_files[shard])) + + for shard in self.output_test_files: + test_counts.append(self.get_sentences_per_shard(self.output_test_files[shard])) + + training_median = statistics.median(training_counts) + test_median = statistics.median(test_counts) + + print('Distributing data over shards:', len(unused_article_set), 'articles remaining.') + + + if len(unused_article_set) != 0: + print('Warning: Some articles did not make it into output files.') + + + for shard in self.output_training_files: + print('Training shard:', self.get_sentences_per_shard(self.output_training_files[shard])) + + for shard in self.output_test_files: + print('Test shard:', self.get_sentences_per_shard(self.output_test_files[shard])) + + print('End: Distribute Articles Over Shards') + + + def write_shards_to_disk(self): + print('Start: Write Shards to Disk') + for shard in self.output_training_files: + self.write_single_shard(shard, self.output_training_files[shard]) + + for shard in self.output_test_files: + self.write_single_shard(shard, self.output_test_files[shard]) + + print('End: Write Shards to Disk') + + + def write_single_shard(self, shard_name, shard): + with open(shard_name, mode='w', newline='\n') as f: + for article_id in shard: + for line in self.sentences[article_id]: + f.write(line + '\n') + + f.write('\n') # Line break between articles + + +import nltk + +nltk.download('punkt') + +class NLTKSegmenter: + def __init(self): + pass + + def segment_string(self, article): + return nltk.tokenize.sent_tokenize(article) + diff --git a/modelzoo/ELECTRA/data/WikiDownloader.py b/modelzoo/ELECTRA/data/WikiDownloader.py new file mode 100644 index 00000000..505ec76c --- /dev/null +++ 
b/modelzoo/ELECTRA/data/WikiDownloader.py @@ -0,0 +1,57 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import bz2 +import os +import urllib.request +import subprocess +import sys + +class WikiDownloader: + def __init__(self, language, save_path): + self.save_path = save_path + '/wikicorpus_' + language + + if not os.path.exists(self.save_path): + os.makedirs(self.save_path) + + self.language = language + self.download_urls = { + 'en' : 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2', + 'zh' : 'https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2' + } + + self.output_files = { + 'en' : 'wikicorpus_en.xml.bz2', + 'zh' : 'wikicorpus_zh.xml.bz2' + } + + + def download(self): + if self.language in self.download_urls: + url = self.download_urls[self.language] + filename = self.output_files[self.language] + + print('Downloading:', url) + if os.path.isfile(self.save_path + '/' + filename): + print('** Download file already exists, skipping download') + else: + response = urllib.request.urlopen(url) + with open(self.save_path + '/' + filename, "wb") as handle: + handle.write(response.read()) + + # Always unzipping since this is relatively fast and will overwrite + print('Unzipping:', self.output_files[self.language]) + subprocess.run('bzip2 -dk ' + self.save_path + '/' + filename, shell=True, check=True) + + else: + assert False, 'WikiDownloader not implemented for this language yet.' \ No newline at end of file diff --git a/modelzoo/ELECTRA/data/WikicorpusTextFormatting.py b/modelzoo/ELECTRA/data/WikicorpusTextFormatting.py new file mode 100644 index 00000000..9d356b13 --- /dev/null +++ b/modelzoo/ELECTRA/data/WikicorpusTextFormatting.py @@ -0,0 +1,46 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
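The formatter added below consumes the directory tree produced by WikiExtractor, in which each extracted shard wraps every article in <doc id=...> ... </doc> markers. The following sketch is illustrative only: the sample text and temporary paths are invented, and WikicorpusTextFormatting (defined just below) is assumed to be in scope.

# Illustrative only: build a tiny WikiExtractor-style shard and run merge() over it.
import os
import tempfile

tmp = tempfile.mkdtemp()
os.makedirs(os.path.join(tmp, 'AA'))
with open(os.path.join(tmp, 'AA', 'wiki_00'), mode='w') as f:
    f.write('<doc id="1" url="https://en.wikipedia.org/wiki?curid=1" title="Example">\n'
            'Example\n'
            'First sentence of the article. Second sentence of the article.\n'
            '</doc>\n')

out_file = os.path.join(tmp, 'wikicorpus_en_one_article_per_line.txt')
formatter = WikicorpusTextFormatting(tmp, out_file, recursive=True)
formatter.merge()

with open(out_file) as f:
    # One line per article (the repeated title line is dropped via article_lines[1:]),
    # followed by a blank separator line.
    print(f.read())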
+
+import glob
+import os
+
+class WikicorpusTextFormatting:
+    def __init__(self, wiki_path, output_filename, recursive = False):
+        self.wiki_path = wiki_path
+        self.recursive = recursive
+        self.output_filename = output_filename
+
+
+    # This puts one article per line
+    def merge(self):
+        with open(self.output_filename, mode='w', newline='\n') as ofile:
+            for dirname in glob.glob(self.wiki_path + '/*/', recursive=False):
+                for filename in glob.glob(dirname + 'wiki_*', recursive=self.recursive):
+                    print(filename)
+                    article_lines = []
+                    article_open = False
+
+                    with open(filename, mode='r', newline='\n') as file:
+                        for line in file:
+                            if '<doc id=' in line:
+                                article_open = True
+                            elif '</doc>' in line:
+                                article_open = False
+                                for oline in article_lines[1:]:
+                                    if oline != '\n':
+                                        ofile.write(oline.rstrip() + " ")
+                                ofile.write("\n\n")
+                                article_lines = []
+                            else:
+                                if article_open:
+                                    article_lines.append(line)
\ No newline at end of file
diff --git a/modelzoo/ELECTRA/data/__init__.py b/modelzoo/ELECTRA/data/__init__.py
new file mode 100644
index 00000000..98386fd4
--- /dev/null
+++ b/modelzoo/ELECTRA/data/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/modelzoo/ELECTRA/data/create_datasets_from_start.sh b/modelzoo/ELECTRA/data/create_datasets_from_start.sh
new file mode 100755
index 00000000..58a72437
--- /dev/null
+++ b/modelzoo/ELECTRA/data/create_datasets_from_start.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+to_download=${1:-"wiki_only"}
+
+# Download
+if [ "$to_download" = "wiki_books" ] ; then
+    python3 /workspace/electra/data/dataPrep.py --action download --dataset bookscorpus
+fi
+python3 /workspace/electra/data/dataPrep.py --action download --dataset wikicorpus_en
+
+# Download SQuAD
+python3 /workspace/electra/data/dataPrep.py --action download --dataset squad
+
+# Properly format the text files
+if [ "$to_download" = "wiki_books" ] ; then
+    python3 /workspace/electra/data/dataPrep.py --action text_formatting --dataset bookscorpus
+fi
+python3 /workspace/electra/data/dataPrep.py --action text_formatting --dataset wikicorpus_en
+
+if [ "$to_download" = "wiki_books" ] ; then
+    DATASET="books_wiki_en_corpus"
+else
+    DATASET="wikicorpus_en"
+    # Shard the text files
+fi
+
+# Shard the text files (group wiki+books then shard)
+python3 /workspace/electra/data/dataPrep.py --action sharding --dataset $DATASET --n_test_shards 2048 --n_training_shards 2048
+
+# Create TFRecord files Phase 1
+python3 /workspace/electra/data/dataPrep.py --action create_tfrecord_files --dataset $DATASET --max_seq_length 128 --n_test_shards 2048 --n_training_shards 2048 --vocab_file=vocab/vocab.txt --do_lower_case=1
+
+# Create TFRecord files Phase 2
+python3 /workspace/electra/data/dataPrep.py --action create_tfrecord_files --dataset $DATASET --max_seq_length 512 --n_test_shards 2048 --n_training_shards 2048 --vocab_file=vocab/vocab.txt --do_lower_case=1
diff --git a/modelzoo/ELECTRA/data/dataPrep.py b/modelzoo/ELECTRA/data/dataPrep.py
new file mode 100644
index 00000000..a029bc63
--- /dev/null
+++ b/modelzoo/ELECTRA/data/dataPrep.py
@@ -0,0 +1,312 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
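dataPrep.py, which starts below, drives the whole pipeline. Its sharding action reduces to roughly the following sequence of TextSharding calls; the input and output paths and the shard counts here are illustrative (the script defaults to 2048 training and 2048 test shards).

# Rough sketch of the sharding step, assuming a formatted one-article-per-line
# file already exists (the paths below are illustrative).
import os

import TextSharding

os.makedirs('sharded/wikicorpus_en', exist_ok=True)

segmenter = TextSharding.NLTKSegmenter()
sharding = TextSharding.Sharding(
    input_files=['formatted/wikicorpus_en_one_article_per_line.txt'],
    output_name_prefix='sharded/wikicorpus_en/wikicorpus_en',
    n_training_shards=4,   # dataPrep.py defaults to 2048
    n_test_shards=2,
    fraction_test_set=0.1,
)

sharding.load_articles()                              # one article per input line
sharding.segment_articles_into_sentences(segmenter)   # NLTK punkt sentence splitting
sharding.distribute_articles_over_shards()            # balance sentence counts across shards
sharding.write_shards_to_disk()                       # one sentence per line, blank line between articles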
+ +import BookscorpusTextFormatting +import Downloader +import TextSharding +import WikicorpusTextFormatting + +import argparse +import itertools +import multiprocessing +import os +import pprint +import subprocess + + +def main(args): + working_dir = os.environ['DATA_PREP_WORKING_DIR'] + + print('Working Directory:', working_dir) + print('Action:', args.action) + print('Dataset Name:', args.dataset) + + if args.input_files: + args.input_files = args.input_files.split(',') + + hdf5_tfrecord_folder_prefix = "_lower_case_" + str(args.do_lower_case) + "_seq_len_" + str(args.max_seq_length) \ + + "_random_seed_" + str(args.random_seed) + + directory_structure = { + 'download' : working_dir + '/download', # Downloaded and decompressed + 'extracted' : working_dir +'/extracted', # Extracted from whatever the initial format is (e.g., wikiextractor) + 'formatted' : working_dir + '/formatted_one_article_per_line', # This is the level where all sources should look the same + 'sharded' : working_dir + '/sharded_' + "training_shards_" + str(args.n_training_shards) + "_test_shards_" + str(args.n_test_shards) + "_fraction_" + str(args.fraction_test_set), + 'tfrecord' : working_dir + '/tfrecord'+ hdf5_tfrecord_folder_prefix, + 'hdf5': working_dir + '/hdf5' + hdf5_tfrecord_folder_prefix + } + + print('\nDirectory Structure:') + pp = pprint.PrettyPrinter(indent=2) + pp.pprint(directory_structure) + print('') + + if args.action == 'download': + if not os.path.exists(directory_structure['download']): + os.makedirs(directory_structure['download']) + + downloader = Downloader.Downloader(args.dataset, directory_structure['download']) + downloader.download() + + elif args.action == 'text_formatting': + assert args.dataset != 'google_pretrained_weights' and args.dataset != 'nvidia_pretrained_weights' and args.dataset != 'squad' and args.dataset != 'mrpc', 'Cannot perform text_formatting on pretrained weights' + + if not os.path.exists(directory_structure['extracted']): + os.makedirs(directory_structure['extracted']) + + if not os.path.exists(directory_structure['formatted']): + os.makedirs(directory_structure['formatted']) + + if args.dataset == 'bookscorpus': + books_path = directory_structure['download'] + '/bookscorpus' + #books_path = directory_structure['download'] + output_filename = directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt' + books_formatter = BookscorpusTextFormatting.BookscorpusTextFormatting(books_path, output_filename, recursive=True) + books_formatter.merge() + + elif args.dataset == 'wikicorpus_en': + if args.skip_wikiextractor == 0: + path_to_wikiextractor_in_container = '/workspace/wikiextractor/WikiExtractor.py' + wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_en.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset + print('WikiExtractor Command:', wikiextractor_command) + wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True) + #wikiextractor_process.communicate() + + wiki_path = directory_structure['extracted'] + '/wikicorpus_en' + output_filename = directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt' + wiki_formatter = WikicorpusTextFormatting.WikicorpusTextFormatting(wiki_path, output_filename, recursive=True) + wiki_formatter.merge() + + elif args.dataset == 'wikicorpus_zh': + raise NotImplementedError( + 'wikicorpus_zh not fully supported at this time. 
The simplified/tradition Chinese data needs to be ' + 'translated and properly segmented still, and should work once this step is added.') + # if args.skip_wikiextractor == 0: + # path_to_wikiextractor_in_container = '/workspace/wikiextractor/WikiExtractor.py' + # wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_zh.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset + # print('WikiExtractor Command:', wikiextractor_command) + # wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True) + # #wikiextractor_process.communicate() + # + # wiki_path = directory_structure['extracted'] + '/wikicorpus_zh' + # output_filename = directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt' + # wiki_formatter = WikicorpusTextFormatting.WikicorpusTextFormatting(wiki_path, output_filename, recursive=True) + # wiki_formatter.merge() + # + # assert os.stat(output_filename).st_size > 0, 'File glob did not pick up extracted wiki files from WikiExtractor.' + + elif args.action == 'sharding': + # Note: books+wiki requires user to provide list of input_files (comma-separated with no spaces) + if args.dataset == 'bookscorpus' or 'wikicorpus' in args.dataset or 'books_wiki' in args.dataset: + if args.input_files is None: + if args.dataset == 'bookscorpus': + args.input_files = [directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt'] + elif args.dataset == 'wikicorpus_en': + args.input_files = [directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt'] + elif args.dataset == 'wikicorpus_zh': + args.input_files = [directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt'] + elif args.dataset == 'books_wiki_en_corpus': + args.input_files = [directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt', directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt'] + + output_file_prefix = directory_structure['sharded'] + '/' + args.dataset + '/' + args.dataset + + if not os.path.exists(directory_structure['sharded']): + os.makedirs(directory_structure['sharded']) + + if not os.path.exists(directory_structure['sharded'] + '/' + args.dataset): + os.makedirs(directory_structure['sharded'] + '/' + args.dataset) + + # Segmentation is here because all datasets look the same in one article/book/whatever per line format, and + # it seemed unnecessarily complicated to add an additional preprocessing step to call just for this. 
+ # Different languages (e.g., Chinese simplified/traditional) may require translation and + # other packages to be called from here -- just add a conditional branch for those extra steps + segmenter = TextSharding.NLTKSegmenter() + sharding = TextSharding.Sharding(args.input_files, output_file_prefix, args.n_training_shards, args.n_test_shards, args.fraction_test_set) + + sharding.load_articles() + sharding.segment_articles_into_sentences(segmenter) + sharding.distribute_articles_over_shards() + sharding.write_shards_to_disk() + + for _dir in ['train', 'test']: + if not os.path.exists(directory_structure['sharded'] + '/' + args.dataset + '/' + _dir): + os.makedirs(directory_structure['sharded'] + '/' + args.dataset + '/' + _dir) + absolute_dir = directory_structure['sharded'] + '/' + args.dataset + command = 'mv ' + absolute_dir + '/*' + _dir + '*.txt' + ' ' + absolute_dir + '/' + _dir + mv_process = subprocess.Popen(command, shell=True) + + mv_process.wait() + else: + assert False, 'Unsupported dataset for sharding' + + elif args.action == 'create_tfrecord_files': + + if not os.path.exists(directory_structure['tfrecord'] + "/" + args.dataset): + os.makedirs(directory_structure['tfrecord'] + "/" + args.dataset) + if args.vocab_file is None: + args.vocab_file = os.path.join(working_dir, "vocab.txt") + + for _dir in ['train', 'test']: + electra_preprocessing_command = 'python /workspace/electra/build_pretraining_dataset.py' + electra_preprocessing_command += ' --corpus-dir=' + directory_structure['sharded'] + '/' + args.dataset + '/' + _dir + electra_preprocessing_command += ' --output-dir=' + directory_structure['tfrecord'] + '/' + args.dataset + '/' + _dir + electra_preprocessing_command += ' --vocab-file=' + args.vocab_file + electra_preprocessing_command += ' --do-lower-case' if args.do_lower_case else ' --no-lower-case' + electra_preprocessing_command += ' --max-seq-length=' + str(args.max_seq_length) + electra_preprocessing_command += ' --num-processes=8' + electra_preprocessing_command += ' --num-out-files=' + str(args.n_training_shards) if _dir == 'train' \ + else ' --num-out-files=' + str(args.n_test_shards) + electra_preprocessing_process = subprocess.Popen(electra_preprocessing_command, shell=True) + + electra_preprocessing_process.wait() + + + elif args.action == 'create_hdf5_files': + raise NotImplementedError + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='Preprocessing Application for Everything BERT-related' + ) + + parser.add_argument( + '--action', + type=str, + help='Specify the action you want the app to take. 
e.g., generate vocab, segment, create tfrecords', + choices={ + 'download', # Download and verify mdf5/sha sums + 'text_formatting', # Convert into a file that contains one article/book per line + 'sharding', # Convert previous formatted text into shards containing one sentence per line + 'create_tfrecord_files', # Turn each shard into a TFrecord with masking and next sentence prediction info + 'create_hdf5_files' # Turn each shard into a HDF5 file with masking and next sentence prediction info + } + ) + + parser.add_argument( + '--dataset', + type=str, + help='Specify the dataset to perform --action on', + choices={ + 'bookscorpus', + 'wikicorpus_en', + 'wikicorpus_zh', + 'books_wiki_en_corpus', + 'google_pretrained_weights', + 'nvidia_pretrained_weights', + 'mrpc', + 'squad', + 'all' + } + ) + + parser.add_argument( + '--input_files', + type=str, + help='Specify the input files in a comma-separated list (no spaces)' + ) + + parser.add_argument( + '--n_training_shards', + type=int, + help='Specify the number of training shards to generate', + default=2048 + ) + + parser.add_argument( + '--n_test_shards', + type=int, + help='Specify the number of test shards to generate', + default=2048 + ) + + parser.add_argument( + '--fraction_test_set', + type=float, + help='Specify the fraction (0..1) of the data to withhold for the test data split (based on number of sequences)', + default=0.1 + ) + + parser.add_argument( + '--segmentation_method', + type=str, + help='Specify your choice of sentence segmentation', + choices={ + 'nltk' + }, + default='nltk' + ) + + parser.add_argument( + '--n_processes', + type=int, + help='Specify the max number of processes to allow at one time', + default=4 + ) + + parser.add_argument( + '--random_seed', + type=int, + help='Specify the base seed to use for any random number generation', + default=12345 + ) + + parser.add_argument( + '--dupe_factor', + type=int, + help='Specify the duplication factor', + default=5 + ) + + parser.add_argument( + '--masked_lm_prob', + type=float, + help='Specify the probability for masked lm', + default=0.15 + ) + + parser.add_argument( + '--max_seq_length', + type=int, + help='Specify the maximum sequence length', + default=512 + ) + + parser.add_argument( + '--do_lower_case', + type=int, + help='Specify whether it is cased (0) or uncased (1) (any number greater than 0 will be treated as uncased)', + default=0 + ) + + parser.add_argument( + '--vocab_file', + type=str, + help='Specify absolute path to vocab file to use)' + ) + + parser.add_argument( + '--skip_wikiextractor', + type=int, + help='Specify whether to skip wikiextractor step 0=False, 1=True', + default=0 + ) + + parser.add_argument( + '--interactive_json_config_generator', + type=str, + help='Specify the action you want the app to take. e.g., generate vocab, segment, create tfrecords' + ) + + args = parser.parse_args() + main(args) diff --git a/modelzoo/ELECTRA/data/glue/download_mrpc.sh b/modelzoo/ELECTRA/data/glue/download_mrpc.sh new file mode 100755 index 00000000..65f3446b --- /dev/null +++ b/modelzoo/ELECTRA/data/glue/download_mrpc.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash + +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +echo "Downloading MRPC data" + +wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py + +python download_glue_data.py --data_dir . --tasks MRPC diff --git a/modelzoo/ELECTRA/data/squad/squad_download.sh b/modelzoo/ELECTRA/data/squad/squad_download.sh new file mode 100755 index 00000000..7aa6f268 --- /dev/null +++ b/modelzoo/ELECTRA/data/squad/squad_download.sh @@ -0,0 +1,73 @@ +#!/usr/bin/env bash + +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +echo "Downloading dataset for squad..." + +# Download SQuAD + +v1="v1.1" +mkdir $v1 +wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json -O $v1/train-v1.1.json +wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json -O $v1/dev-v1.1.json +wget https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/ -O $v1/evaluate-v1.1.py + +EXP_TRAIN_v1='981b29407e0affa3b1b156f72073b945 -' +EXP_DEV_v1='3e85deb501d4e538b6bc56f786231552 -' +EXP_EVAL_v1='afb04912d18ff20696f7f88eed49bea9 -' +CALC_TRAIN_v1=`cat ${v1}/train-v1.1.json |md5sum` +CALC_DEV_v1=`cat ${v1}/dev-v1.1.json |md5sum` +CALC_EVAL_v1=`cat ${v1}/evaluate-v1.1.py |md5sum` + +v2="v2.0" +mkdir $v2 +wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O $v2/train-v2.0.json +wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O $v2/dev-v2.0.json +wget https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/ -O $v2/evaluate-v2.0.py + +EXP_TRAIN_v2='62108c273c268d70893182d5cf8df740 -' +EXP_DEV_v2='246adae8b7002f8679c027697b0b7cf8 -' +EXP_EVAL_v2='ff23213bed5516ea4a6d9edb6cd7d627 -' + +CALC_TRAIN_v2=`cat ${v2}/train-v2.0.json |md5sum` +CALC_DEV_v2=`cat ${v2}/dev-v2.0.json |md5sum` +CALC_EVAL_v2=`cat ${v2}/evaluate-v2.0.py |md5sum` + +echo "Squad data download done!" + +echo "Verifying Dataset...." + +if [ "$EXP_TRAIN_v1" != "$CALC_TRAIN_v1" ]; then + echo "train-v1.1.json is corrupted! md5sum doesn't match" +fi + +if [ "$EXP_DEV_v1" != "$CALC_DEV_v1" ]; then + echo "dev-v1.1.json is corrupted! md5sum doesn't match" +fi +if [ "$EXP_EVAL_v1" != "$CALC_EVAL_v1" ]; then + echo "evaluate-v1.1.py is corrupted! md5sum doesn't match" +fi + + +if [ "$EXP_TRAIN_v2" != "$CALC_TRAIN_v2" ]; then + echo "train-v2.0.json is corrupted! md5sum doesn't match" +fi +if [ "$EXP_DEV_v2" != "$CALC_DEV_v2" ]; then + echo "dev-v2.0.json is corrupted! 
md5sum doesn't match" +fi +if [ "$EXP_EVAL_v2" != "$CALC_EVAL_v2" ]; then + echo "evaluate-v2.0.py is corrupted! md5sum doesn't match" +fi + +echo "Complete!" diff --git a/modelzoo/ELECTRA/file_utils.py b/modelzoo/ELECTRA/file_utils.py new file mode 100644 index 00000000..da6a96e0 --- /dev/null +++ b/modelzoo/ELECTRA/file_utils.py @@ -0,0 +1,515 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Utilities for working with the local dataset cache. +This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp +Copyright by the AllenNLP authors. +""" + +import fnmatch +import json +import logging +import os +import shutil +import sys +import tarfile +import tempfile +from contextlib import contextmanager +from functools import partial, wraps +from hashlib import sha256 +from typing import Optional +from urllib.parse import urlparse +from zipfile import ZipFile, is_zipfile + +import boto3 +import requests +from botocore.config import Config +from botocore.exceptions import ClientError +from filelock import FileLock +from tqdm.auto import tqdm + +# from examples import __version__ +__version__ = "0.1" + + +logger = logging.getLogger(__name__) # pylint: disable=invalid-name + +try: + USE_TF = os.environ.get("USE_TF", "AUTO").upper() + USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper() + if USE_TORCH in ("1", "ON", "YES", "AUTO") and USE_TF not in ("1", "ON", "YES"): + import torch + + _torch_available = True # pylint: disable=invalid-name + logger.info("PyTorch version {} available.".format(torch.__version__)) + else: + logger.info("Disabling PyTorch because USE_TF is set") + _torch_available = False +except ImportError: + _torch_available = False # pylint: disable=invalid-name + +try: + USE_TF = os.environ.get("USE_TF", "AUTO").upper() + USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper() + + if USE_TF in ("1", "ON", "YES", "AUTO") and USE_TORCH not in ("1", "ON", "YES"): + import tensorflow as tf + + assert hasattr(tf, "__version__") and int(tf.__version__[0]) >= 2 + _tf_available = True # pylint: disable=invalid-name + logger.info("TensorFlow version {} available.".format(tf.__version__)) + else: + logger.info("Disabling Tensorflow because USE_TORCH is set") + _tf_available = False +except (ImportError, AssertionError): + _tf_available = False # pylint: disable=invalid-name + +try: + from torch.hub import _get_torch_home + + torch_cache_home = _get_torch_home() +except ImportError: + torch_cache_home = os.path.expanduser( + os.getenv("TORCH_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "torch")) + ) +default_cache_path = os.path.join(torch_cache_home, "transformers") + +try: + from pathlib import Path + + PYTORCH_PRETRAINED_BERT_CACHE = Path( + os.getenv("PYTORCH_TRANSFORMERS_CACHE", os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path)) + ) +except (AttributeError, 
ImportError): + PYTORCH_PRETRAINED_BERT_CACHE = os.getenv( + "PYTORCH_TRANSFORMERS_CACHE", os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path) + ) + +PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility +TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility + +WEIGHTS_NAME = "pytorch_model.bin" +TF2_WEIGHTS_NAME = "tf_model.h5" +TF_WEIGHTS_NAME = "model.ckpt" +CONFIG_NAME = "config.json" +MODEL_CARD_NAME = "modelcard.json" + + +MULTIPLE_CHOICE_DUMMY_INPUTS = [[[0], [1]], [[0], [1]]] +DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] +DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]] + +S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert" +CLOUDFRONT_DISTRIB_PREFIX = "https://d2ws9o8vfrpkyk.cloudfront.net" + + +def is_torch_available(): + return _torch_available + + +def is_tf_available(): + return _tf_available + + +def add_start_docstrings(*docstr): + def docstring_decorator(fn): + fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") + return fn + + return docstring_decorator + + +def add_start_docstrings_to_callable(*docstr): + def docstring_decorator(fn): + class_name = ":class:`~transformers.{}`".format(fn.__qualname__.split(".")[0]) + intro = " The {} forward method, overrides the :func:`__call__` special method.".format(class_name) + note = r""" + + .. note:: + Although the recipe for forward pass needs to be defined within + this function, one should call the :class:`Module` instance afterwards + instead of this since the former takes care of running the + pre and post processing steps while the latter silently ignores them. + """ + fn.__doc__ = intro + note + "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") + return fn + + return docstring_decorator + + +def add_end_docstrings(*docstr): + def docstring_decorator(fn): + fn.__doc__ = fn.__doc__ + "".join(docstr) + return fn + + return docstring_decorator + + +def is_remote_url(url_or_filename): + parsed = urlparse(url_or_filename) + return parsed.scheme in ("http", "https", "s3") + + +def hf_bucket_url(identifier, postfix=None, cdn=False) -> str: + endpoint = CLOUDFRONT_DISTRIB_PREFIX if cdn else S3_BUCKET_PREFIX + if postfix is None: + return "/".join((endpoint, identifier)) + else: + return "/".join((endpoint, identifier, postfix)) + + +def url_to_filename(url, etag=None): + """ + Convert `url` into a hashed filename in a repeatable way. + If `etag` is specified, append its hash to the url's, delimited + by a period. + If the url ends with .h5 (Keras HDF5 weights) adds '.h5' to the name + so that TF 2.0 can identify it as a HDF5 file + (see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380) + """ + url_bytes = url.encode("utf-8") + url_hash = sha256(url_bytes) + filename = url_hash.hexdigest() + + if etag: + etag_bytes = etag.encode("utf-8") + etag_hash = sha256(etag_bytes) + filename += "." + etag_hash.hexdigest() + + if url.endswith(".h5"): + filename += ".h5" + + return filename + + +def filename_to_url(filename, cache_dir=None): + """ + Return the url and etag (which may be ``None``) stored for `filename`. + Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. 
+ """
+ if cache_dir is None:
+ cache_dir = TRANSFORMERS_CACHE
+ if isinstance(cache_dir, Path):
+ cache_dir = str(cache_dir)
+
+ cache_path = os.path.join(cache_dir, filename)
+ if not os.path.exists(cache_path):
+ raise EnvironmentError("file {} not found".format(cache_path))
+
+ meta_path = cache_path + ".json"
+ if not os.path.exists(meta_path):
+ raise EnvironmentError("file {} not found".format(meta_path))
+
+ with open(meta_path, encoding="utf-8") as meta_file:
+ metadata = json.load(meta_file)
+ url = metadata["url"]
+ etag = metadata["etag"]
+
+ return url, etag
+
+
+def cached_path(
+ url_or_filename,
+ cache_dir=None,
+ force_download=False,
+ proxies=None,
+ resume_download=False,
+ user_agent=None,
+ extract_compressed_file=False,
+ force_extract=False,
+ local_files_only=False,
+) -> Optional[str]:
+ """
+ Given something that might be a URL (or might be a local path),
+ determine which. If it's a URL, download the file and cache it, and
+ return the path to the cached file. If it's already a local path,
+ make sure the file exists and then return the path.
+ Args:
+ cache_dir: specify a cache directory to save the file to (overwrite the default cache dir).
+ force_download: if True, re-download the file even if it's already cached in the cache dir.
+ resume_download: if True, resume the download if an incompletely received file is found.
+ user_agent: Optional string or dict that will be appended to the user-agent on remote requests.
+ extract_compressed_file: if True and the path points to a zip or tar file, extract the compressed
+ file into a folder alongside the archive.
+ force_extract: if True when extract_compressed_file is True and the archive was already extracted,
+ re-extract the archive and override the folder where it was extracted.
+
+ Return:
+ None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
+ Local path (string) otherwise
+ """
+ if cache_dir is None:
+ cache_dir = TRANSFORMERS_CACHE
+ if isinstance(url_or_filename, Path):
+ url_or_filename = str(url_or_filename)
+ if isinstance(cache_dir, Path):
+ cache_dir = str(cache_dir)
+
+ if is_remote_url(url_or_filename):
+ # URL, so get it from the cache (downloading if necessary)
+ output_path = get_from_cache(
+ url_or_filename,
+ cache_dir=cache_dir,
+ force_download=force_download,
+ proxies=proxies,
+ resume_download=resume_download,
+ user_agent=user_agent,
+ local_files_only=local_files_only,
+ )
+ elif os.path.exists(url_or_filename):
+ # File, and it exists.
+ output_path = url_or_filename
+ elif urlparse(url_or_filename).scheme == "":
+ # File, but it doesn't exist.
+ raise EnvironmentError("file {} not found".format(url_or_filename))
+ else:
+ # Something unknown
+ raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
+
+ if extract_compressed_file:
+ if not is_zipfile(output_path) and not tarfile.is_tarfile(output_path):
+ return output_path
+
+ # Path where we extract compressed archives
+ # We avoid '.'
in dir name and add "-extracted" at the end: "./model.zip" => "./model-zip-extracted/" + output_dir, output_file = os.path.split(output_path) + output_extract_dir_name = output_file.replace(".", "-") + "-extracted" + output_path_extracted = os.path.join(output_dir, output_extract_dir_name) + + if os.path.isdir(output_path_extracted) and os.listdir(output_path_extracted) and not force_extract: + return output_path_extracted + + # Prevent parallel extractions + lock_path = output_path + ".lock" + with FileLock(lock_path): + shutil.rmtree(output_path_extracted, ignore_errors=True) + os.makedirs(output_path_extracted) + if is_zipfile(output_path): + with ZipFile(output_path, "r") as zip_file: + zip_file.extractall(output_path_extracted) + zip_file.close() + elif tarfile.is_tarfile(output_path): + tar_file = tarfile.open(output_path) + tar_file.extractall(output_path_extracted) + tar_file.close() + else: + raise EnvironmentError("Archive format of {} could not be identified".format(output_path)) + + return output_path_extracted + + return output_path + + +def split_s3_path(url): + """Split a full s3 path into the bucket name and path.""" + parsed = urlparse(url) + if not parsed.netloc or not parsed.path: + raise ValueError("bad s3 path {}".format(url)) + bucket_name = parsed.netloc + s3_path = parsed.path + # Remove '/' at beginning of path. + if s3_path.startswith("/"): + s3_path = s3_path[1:] + return bucket_name, s3_path + + +def s3_request(func): + """ + Wrapper function for s3 requests in order to create more helpful error + messages. + """ + + @wraps(func) + def wrapper(url, *args, **kwargs): + try: + return func(url, *args, **kwargs) + except ClientError as exc: + if int(exc.response["Error"]["Code"]) == 404: + raise EnvironmentError("file {} not found".format(url)) + else: + raise + + return wrapper + + +@s3_request +def s3_etag(url, proxies=None): + """Check ETag on S3 object.""" + s3_resource = boto3.resource("s3", config=Config(proxies=proxies)) + bucket_name, s3_path = split_s3_path(url) + s3_object = s3_resource.Object(bucket_name, s3_path) + return s3_object.e_tag + + +@s3_request +def s3_get(url, temp_file, proxies=None): + """Pull a file directly from S3.""" + s3_resource = boto3.resource("s3", config=Config(proxies=proxies)) + bucket_name, s3_path = split_s3_path(url) + s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) + + +def http_get(url, temp_file, proxies=None, resume_size=0, user_agent=None): + ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0]) + if is_torch_available(): + ua += "; torch/{}".format(torch.__version__) + if is_tf_available(): + ua += "; tensorflow/{}".format(tf.__version__) + if isinstance(user_agent, dict): + ua += "; " + "; ".join("{}/{}".format(k, v) for k, v in user_agent.items()) + elif isinstance(user_agent, str): + ua += "; " + user_agent + headers = {"user-agent": ua} + if resume_size > 0: + headers["Range"] = "bytes=%d-" % (resume_size,) + response = requests.get(url, stream=True, proxies=proxies, headers=headers) + if response.status_code == 416: # Range not satisfiable + return + content_length = response.headers.get("Content-Length") + total = resume_size + int(content_length) if content_length is not None else None + progress = tqdm( + unit="B", + unit_scale=True, + total=total, + initial=resume_size, + desc="Downloading", + disable=bool(logger.getEffectiveLevel() == logging.NOTSET), + ) + for chunk in response.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + 
progress.update(len(chunk)) + temp_file.write(chunk) + progress.close() + + +def get_from_cache( + url, + cache_dir=None, + force_download=False, + proxies=None, + etag_timeout=10, + resume_download=False, + user_agent=None, + local_files_only=False, +) -> Optional[str]: + """ + Given a URL, look for the corresponding file in the local cache. + If it's not there, download it. Then return the path to the cached file. + + Return: + None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk). + Local path (string) otherwise + """ + if cache_dir is None: + cache_dir = TRANSFORMERS_CACHE + if isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + os.makedirs(cache_dir, exist_ok=True) + + etag = None + if not local_files_only: + # Get eTag to add to filename, if it exists. + if url.startswith("s3://"): + etag = s3_etag(url, proxies=proxies) + else: + try: + response = requests.head(url, allow_redirects=True, proxies=proxies, timeout=etag_timeout) + if response.status_code == 200: + etag = response.headers.get("ETag") + except (EnvironmentError, requests.exceptions.Timeout): + # etag is already None + pass + + filename = url_to_filename(url, etag) + + # get cache path to put the file + cache_path = os.path.join(cache_dir, filename) + + # etag is None = we don't have a connection, or url doesn't exist, or is otherwise inaccessible. + # try to get the last downloaded one + if etag is None: + if os.path.exists(cache_path): + return cache_path + else: + matching_files = [ + file + for file in fnmatch.filter(os.listdir(cache_dir), filename + ".*") + if not file.endswith(".json") and not file.endswith(".lock") + ] + if len(matching_files) > 0: + return os.path.join(cache_dir, matching_files[-1]) + else: + # If files cannot be found and local_files_only=True, + # the models might've been found if local_files_only=False + # Notify the user about that + if local_files_only: + raise ValueError( + "Cannot find the requested files in the cached path and outgoing traffic has been" + " disabled. To enable model look-ups and downloads online, set 'local_files_only'" + " to False." + ) + return None + + # From now on, etag is not None. + if os.path.exists(cache_path) and not force_download: + return cache_path + + # Prevent parallel downloads of the same file with a lock. + lock_path = cache_path + ".lock" + with FileLock(lock_path): + + if resume_download: + incomplete_path = cache_path + ".incomplete" + + @contextmanager + def _resumable_file_manager(): + with open(incomplete_path, "a+b") as f: + yield f + + temp_file_manager = _resumable_file_manager + if os.path.exists(incomplete_path): + resume_size = os.stat(incomplete_path).st_size + else: + resume_size = 0 + else: + temp_file_manager = partial(tempfile.NamedTemporaryFile, dir=cache_dir, delete=False) + resume_size = 0 + + # Download to temporary file, then copy to cache dir once finished. + # Otherwise you get corrupt cache entries if the download gets interrupted. 
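+ # Note: temp_file_manager() creates the temporary file inside cache_dir (it is
+ # built with dir=cache_dir above, and the resumable variant writes to
+ # cache_path + ".incomplete"), so the os.replace() below is a rename within the
+ # same directory/filesystem; a reader of the cache sees either the previous
+ # entry or the fully written file, never a partially downloaded one.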
+ with temp_file_manager() as temp_file: + logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name) + + # GET file object + if url.startswith("s3://"): + if resume_download: + logger.warn('Warning: resumable downloads are not implemented for "s3://" urls') + s3_get(url, temp_file, proxies=proxies) + else: + http_get(url, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent) + + logger.info("storing %s in cache at %s", url, cache_path) + os.replace(temp_file.name, cache_path) + + logger.info("creating metadata file for %s", cache_path) + meta = {"url": url, "etag": etag} + meta_path = cache_path + ".json" + with open(meta_path, "w") as meta_file: + json.dump(meta, meta_file) + + return cache_path diff --git a/modelzoo/ELECTRA/gpu_affinity.py b/modelzoo/ELECTRA/gpu_affinity.py new file mode 100644 index 00000000..68520734 --- /dev/null +++ b/modelzoo/ELECTRA/gpu_affinity.py @@ -0,0 +1,63 @@ +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +import os + +import pynvml + +pynvml.nvmlInit() + + +def systemGetDriverVersion(): + return pynvml.nvmlSystemGetDriverVersion() + + +def deviceGetCount(): + return pynvml.nvmlDeviceGetCount() + + +class device: + # assume nvml returns list of 64 bit ints + _nvml_affinity_elements = math.ceil(os.cpu_count() / 64) + + def __init__(self, device_idx): + super().__init__() + self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx) + + def getName(self): + return pynvml.nvmlDeviceGetName(self.handle) + + def getCpuAffinity(self): + affinity_string = '' + for j in pynvml.nvmlDeviceGetCpuAffinity( + self.handle, device._nvml_affinity_elements + ): + # assume nvml returns list of 64 bit ints + affinity_string = '{:064b}'.format(j) + affinity_string + affinity_list = [int(x) for x in affinity_string] + affinity_list.reverse() # so core 0 is in 0th element of list + + return [i for i, e in enumerate(affinity_list) if e != 0] + + +def set_affinity(gpu_id=None): + if gpu_id is None: + gpu_id = int(os.getenv('LOCAL_RANK', 0)) + + dev = device(gpu_id) + os.sched_setaffinity(0, dev.getCpuAffinity()) + + # list of ints representing the logical cores this process is now affinitied with + return os.sched_getaffinity(0) diff --git a/modelzoo/ELECTRA/images/total_loss.svg b/modelzoo/ELECTRA/images/total_loss.svg new file mode 100644 index 00000000..215868d7 --- /dev/null +++ b/modelzoo/ELECTRA/images/total_loss.svg @@ -0,0 +1 @@ +891020-1k01k2k3k4k5k6k7k8k9k10k11k \ No newline at end of file diff --git a/modelzoo/ELECTRA/modeling.py b/modelzoo/ELECTRA/modeling.py new file mode 100644 index 00000000..437decca --- /dev/null +++ b/modelzoo/ELECTRA/modeling.py @@ -0,0 +1,1084 @@ +# Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team. +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging + +import tensorflow as tf + +from configuration import ElectraConfig +from file_utils import add_start_docstrings, add_start_docstrings_to_callable +from modeling_utils import ACT2FN, TFBertEncoder, TFBertPreTrainedModel +from modeling_utils import get_initializer, shape_list +from tokenization_utils import BatchEncoding +import pretrain_utils, collections + +logger = logging.getLogger(__name__) + + +TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAP = { + "google/electra-small-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-generator/tf_model.h5", + "google/electra-base-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-generator/tf_model.h5", + "google/electra-large-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-generator/tf_model.h5", + "google/electra-small-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-discriminator/tf_model.h5", + "google/electra-base-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-discriminator/tf_model.h5", + "google/electra-large-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-discriminator/tf_model.h5", +} + + +class TFElectraEmbeddings(tf.keras.layers.Layer): + """Construct the embeddings from word, position and token_type embeddings. + """ + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.vocab_size = config.vocab_size + self.embedding_size = config.embedding_size + self.initializer_range = config.initializer_range + + self.position_embeddings = tf.keras.layers.Embedding( + config.max_position_embeddings, + config.embedding_size, + embeddings_initializer=get_initializer(self.initializer_range), + name="position_embeddings", + ) + self.token_type_embeddings = tf.keras.layers.Embedding( + config.type_vocab_size, + config.embedding_size, + embeddings_initializer=get_initializer(self.initializer_range), + name="token_type_embeddings", + ) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.amp = config.amp + + def build(self, input_shape): + """Build shared word embedding layer """ + with tf.name_scope("word_embeddings"): + # Create and initialize weights. The random normal initializer was chosen + # arbitrarily, and works well. + self.word_embeddings = self.add_weight( + "weight", + shape=[self.vocab_size, self.embedding_size], + initializer=get_initializer(self.initializer_range), + ) + super().build(input_shape) + + def call(self, inputs, mode="embedding", training=False): + """Get token embeddings of inputs. 
+ Args: + inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) + mode: string, a valid value is one of "embedding" and "linear". + Returns: + outputs: (1) If mode == "embedding", output embedding tensor, float32 with + shape [batch_size, length, embedding_size]; (2) mode == "linear", output + linear tensor, float32 with shape [batch_size, length, vocab_size]. + Raises: + ValueError: if mode is not valid. + + Shared weights logic adapted from + https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 + """ + if mode == "embedding": + return self._embedding(inputs, training=training) + elif mode == "linear": + return self._linear(inputs) + else: + raise ValueError("mode {} is not valid.".format(mode)) + + def _embedding(self, inputs, training=False): + """Applies embedding based on inputs tensor.""" + input_ids, position_ids, token_type_ids, inputs_embeds = inputs + + if input_ids is not None: + input_shape = shape_list(input_ids) + else: + input_shape = shape_list(inputs_embeds)[:-1] + + seq_length = input_shape[1] + if position_ids is None: + position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] + if token_type_ids is None: + token_type_ids = tf.fill(input_shape, 0) + + if inputs_embeds is None: + inputs_embeds = tf.gather(self.word_embeddings, input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + if self.amp: + embeddings = inputs_embeds + tf.cast(position_embeddings, tf.float16) + tf.cast(token_type_embeddings, tf.float16) + else: + embeddings = inputs_embeds + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings, training=training) + return embeddings + + def _linear(self, inputs): + """Computes logits by running inputs through a linear layer. + Args: + inputs: A float32 tensor with shape [batch_size, length, hidden_size] + Returns: + float32 tensor with shape [batch_size, length, vocab_size]. 
+ """ + batch_size = shape_list(inputs)[0] + length = shape_list(inputs)[1] + + x = tf.reshape(inputs, [-1, self.embedding_size]) + logits = tf.matmul(x, self.word_embeddings, transpose_b=True) + + return tf.reshape(logits, [batch_size, length, self.vocab_size]) + + +class TFElectraDiscriminatorPredictions(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.dense = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense") + self.dense_prediction = tf.keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="dense_prediction") + self.config = config + + def call(self, discriminator_hidden_states, training=False): + hidden_states = self.dense(discriminator_hidden_states) + hidden_states = ACT2FN[self.config.hidden_act](hidden_states) + logits = tf.squeeze(self.dense_prediction(hidden_states), axis=-1) + + return logits + + +class TFElectraGeneratorPredictions(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dense = tf.keras.layers.Dense( + config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense") + + def call(self, generator_hidden_states, training=False): + hidden_states = self.dense(generator_hidden_states) + hidden_states = ACT2FN["gelu"](hidden_states) + hidden_states = self.LayerNorm(hidden_states) + + return hidden_states + + +class TFElectraPreTrainedModel(TFBertPreTrainedModel): + + config_class = ElectraConfig + pretrained_model_archive_map = TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAP + base_model_prefix = "electra" + + def get_extended_attention_mask(self, attention_mask, input_shape): + if attention_mask is None: + attention_mask = tf.fill(input_shape, 1) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :] + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
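+ # Worked illustration: for a sequence with attention_mask = [1, 1, 0], the
+ # line below computes (1.0 - [1., 1., 0.]) * -10000.0 = [0., 0., -10000.]
+ # (shape [batch_size, 1, 1, seq_length] after the newaxis indexing above),
+ # so attention paid to the masked position is effectively zero after softmax.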
+ + extended_attention_mask = tf.cast(extended_attention_mask, tf.float32) + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + return extended_attention_mask + + def get_head_mask(self, head_mask): + if head_mask is not None: + raise NotImplementedError + else: + head_mask = [None] * self.config.num_hidden_layers + + return head_mask + + +class TFElectraMainLayer(TFElectraPreTrainedModel): + + config_class = ElectraConfig + + def __init__(self, config, shared_embeddings=False, input_embeddings=None, **kwargs): + super().__init__(config, **kwargs) + + if shared_embeddings and input_embeddings is not None: + self.embeddings = input_embeddings + else: + self.embeddings = TFElectraEmbeddings(config, name="embeddings") + + if config.embedding_size != config.hidden_size: + self.embeddings_project = tf.keras.layers.Dense( + config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name="embeddings_project") + self.encoder = TFBertEncoder(config, name="encoder") + self.config = config + + def get_input_embeddings(self): + return self.embeddings + + def _resize_token_embeddings(self, new_num_tokens): + raise NotImplementedError + + def _prune_heads(self, heads_to_prune): + """ Prunes heads of the model. + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + See base class PreTrainedModel + """ + raise NotImplementedError + + def call( + self, + inputs, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + attention_mask = inputs[1] if len(inputs) > 1 else attention_mask + token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids + position_ids = inputs[3] if len(inputs) > 3 else position_ids + head_mask = inputs[4] if len(inputs) > 4 else head_mask + inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds + assert len(inputs) <= 6, "Too many inputs." + elif isinstance(inputs, (dict, BatchEncoding)): + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) + assert len(inputs) <= 6, "Too many inputs." 
+ else: + input_ids = inputs + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if attention_mask is None: + attention_mask = tf.fill(input_shape, 1) + if token_type_ids is None: + token_type_ids = tf.fill(input_shape, 0) + + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + head_mask = self.get_head_mask(head_mask) + + hidden_states = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training) + + if hasattr(self, "embeddings_project"): + hidden_states = self.embeddings_project(hidden_states, training=training) + + hidden_states = self.encoder([hidden_states, extended_attention_mask, head_mask], training=training) + + return hidden_states + + +ELECTRA_START_DOCSTRING = r""" + This model is a `tf.keras.Model `__ sub-class. + Use it as a regular TF 2.0 Keras Model and + refer to the TF 2.0 documentation for all matter related to general usage and behavior. + + .. note:: + + TF 2.0 models accepts two formats as inputs: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional arguments. + + This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having + all the tensors in the first argument of the model call function: :obj:`model(inputs)`. + + If you choose this second option, there are three possibilities you can use to gather all the input Tensors + in the first positional argument : + + - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` + + Parameters: + config (:class:`~transformers.ElectraConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +ELECTRA_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`transformers.ElectraTokenizer`. + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + + `What are attention masks? 
<../glossary.html#attention-mask>`__ + head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. + inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, embedding_dim)`, `optional`, defaults to :obj:`None`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + training (:obj:`boolean`, `optional`, defaults to :obj:`False`): + Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them + (if set to :obj:`False`) for evaluation. + +""" + + +@add_start_docstrings( + "The bare Electra Model transformer outputting raw hidden-states without any specific head on top. Identical to " + "the BERT model except that it uses an additional linear layer between the embedding layer and the encoder if the " + "hidden size and embedding size are different." + "" + "Both the generator and discriminator checkpoints may be loaded into this model.", + ELECTRA_START_DOCSTRING, +) +class TFElectraModel(TFElectraPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.electra = TFElectraMainLayer(config, name="electra") + + def get_input_embeddings(self): + return self.electra.embeddings + + @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) + def call(self, inputs, **kwargs): + r""" + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: + last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + import tensorflow as tf + from transformers import ElectraTokenizer, TFElectraModel + + tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') + model = TFElectraModel.from_pretrained('google/electra-small-discriminator') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + """ + outputs = self.electra(inputs, **kwargs) + return outputs + + +@add_start_docstrings( + """ +Electra model with a binary classification head on top as used during pre-training for identifying generated +tokens. + +Even though both the discriminator and generator may be loaded into this model, the discriminator is +the only model of the two to have the correct classification head to be used for this model.""", + ELECTRA_START_DOCSTRING, +) +class TFElectraForPreTraining(TFElectraPreTrainedModel): + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + + self.electra = TFElectraMainLayer(config, name="electra") + self.discriminator_predictions = TFElectraDiscriminatorPredictions(config, name="discriminator_predictions") + + def get_input_embeddings(self): + return self.electra.embeddings + + @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): + r""" + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: + scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): + Prediction scores of the head (scores for each token before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + import tensorflow as tf + from transformers import ElectraTokenizer, TFElectraForPreTraining + + tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') + model = TFElectraForPreTraining.from_pretrained('google/electra-small-discriminator') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + scores = outputs[0] + """ + + discriminator_hidden_states = self.electra( + input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, training=training + ) + discriminator_sequence_output = discriminator_hidden_states[0] + logits = self.discriminator_predictions(discriminator_sequence_output) + output = (logits,) + output += discriminator_hidden_states[1:] + + return output # (loss), scores, (hidden_states), (attentions) + + +class TFElectraMaskedLMHead(tf.keras.layers.Layer): + def __init__(self, config, input_embeddings, **kwargs): + super().__init__(**kwargs) + self.vocab_size = config.vocab_size + self.input_embeddings = input_embeddings + + def build(self, input_shape): + self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") + super().build(input_shape) + + def call(self, hidden_states, training=False): + hidden_states = self.input_embeddings(hidden_states, mode="linear") + hidden_states = hidden_states + self.bias + return hidden_states + + +@add_start_docstrings( + """ +Electra model with a language modeling head on top. + +Even though both the discriminator and generator may be loaded into this model, the generator is +the only model of the two to have been trained for the masked language modeling task.""", + ELECTRA_START_DOCSTRING, +) +class TFElectraForMaskedLM(TFElectraPreTrainedModel): + def __init__(self, config, shared_embeddings=False, input_embeddings=None, **kwargs): + super().__init__(config, **kwargs) + + self.vocab_size = config.vocab_size + self.electra = TFElectraMainLayer(config, + shared_embeddings=shared_embeddings, + input_embeddings=input_embeddings, + name="electra") + self.generator_predictions = TFElectraGeneratorPredictions(config, name="generator_predictions") + if isinstance(config.hidden_act, str): + self.activation = ACT2FN[config.hidden_act] + else: + self.activation = config.hidden_act + self.generator_lm_head = TFElectraMaskedLMHead(config, self.electra.embeddings, name="generator_lm_head") + + def get_input_embeddings(self): + return self.electra.embeddings + + def get_output_embeddings(self): + return self.generator_lm_head + + @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): + r""" + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: + prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. 
+ + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import ElectraTokenizer, TFElectraForMaskedLM + + tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator') + model = TFElectraForMaskedLM.from_pretrained('google/electra-small-generator') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + prediction_scores = outputs[0] + + """ + + generator_hidden_states = self.electra( + input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, training=training + ) + generator_sequence_output = generator_hidden_states[0] + prediction_scores = self.generator_predictions(generator_sequence_output, training=training) + prediction_scores = self.generator_lm_head(prediction_scores, training=training) + output = (prediction_scores,) + output += generator_hidden_states[1:] + + return output # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) + +def get_generator_config(config, bert_config): + """Get model config for the generator network.""" + gen_config = ElectraConfig.from_dict(bert_config.to_dict()) + gen_config.hidden_size = int(round( + bert_config.hidden_size * config.generator_hidden_size)) + #To keep hidden size divisble by 64 - attention head size + if gen_config.hidden_size % 64 != 0: + gen_config.hidden_size += 64 - (gen_config.hidden_size % 64) + gen_config.num_hidden_layers = int(round( + bert_config.num_hidden_layers * config.generator_layers)) + gen_config.intermediate_size = 4 * gen_config.hidden_size + gen_config.num_attention_heads = max(1, gen_config.hidden_size // 64) + return gen_config + +class PretrainingModel(tf.keras.Model): + """Transformer pre-training using the replaced-token-detection task.""" + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + # Set up model config + self._config = config + self.disc_config = ElectraConfig(vocab_size=config.vocab_size, + embedding_size=config.embedding_size, + hidden_size=config.hidden_size, + num_hidden_layers=config.num_hidden_layers, + num_attention_heads=config.num_attention_heads, + intermediate_size=4*config.hidden_size, + hidden_act=config.act_func, + hidden_dropout_prob=config.hidden_dropout_prob, + attention_probs_dropout_prob=config.attention_probs_dropout_prob, ) + self.disc_config.update({"amp": config.amp}) + + # Set up discriminator + self.discriminator = TFElectraForPreTraining(self.disc_config) + + # Set up generator + gen_config = get_generator_config(config, self.disc_config) + gen_config.update({"amp": config.amp}) + if config.electra_objective: + if config.shared_embeddings: + self.generator = TFElectraForMaskedLM( + gen_config, shared_embeddings=True, + input_embeddings=self.discriminator.get_input_embeddings()) + else: + self.generator = TFElectraForMaskedLM(gen_config) + else: + self.generator = TFElectraForMaskedLM(self.disc_config) + + def call(self, features, is_training): + config = self._config + + # Mask the input + masked_inputs = pretrain_utils.mask( + config, pretrain_utils.features_to_inputs(features), 
config.mask_prob) + + # Generator + if config.uniform_generator: + mlm_output = self._get_masked_lm_output(masked_inputs, None, is_training=is_training) + else: + mlm_output = self._get_masked_lm_output( + masked_inputs, self.generator, is_training=is_training) + fake_data = self._get_fake_data(masked_inputs, mlm_output.logits) + total_loss = config.gen_weight * mlm_output.loss + + # Discriminator + disc_output = None + if config.electra_objective: + disc_output = self._get_discriminator_output( + fake_data.inputs, self.discriminator, fake_data.is_fake_tokens, + is_training=is_training) + total_loss += config.disc_weight * disc_output.loss + + # Evaluation inputs + eval_fn_inputs = { + "input_ids": masked_inputs.input_ids, + "masked_lm_preds": mlm_output.preds, + "mlm_loss": mlm_output.per_example_loss, + "masked_lm_ids": masked_inputs.masked_lm_ids, + "masked_lm_weights": masked_inputs.masked_lm_weights, + "input_mask": masked_inputs.input_mask + } + if config.electra_objective: + eval_fn_inputs.update({ + "disc_loss": disc_output.per_example_loss, + "disc_labels": disc_output.labels, + "disc_probs": disc_output.probs, + "disc_preds": disc_output.preds, + "sampled_tokids": tf.argmax(fake_data.sampled_tokens, -1, + output_type=tf.int32) + }) + + return total_loss, eval_fn_inputs + + def _get_masked_lm_output(self, inputs, generator, is_training=False): + """Masked language modeling softmax layer.""" + masked_lm_weights = inputs.masked_lm_weights + + if self._config.uniform_generator: + logits = tf.zeros(self.disc_config.vocab_size) + logits_tiled = tf.zeros( + pretrain_utils.get_shape_list(inputs.masked_lm_ids) + + [self.disc_config.vocab_size]) + logits_tiled += tf.reshape(logits, [1, 1, self.disc_config.vocab_size]) + logits = logits_tiled + else: + outputs = generator( + input_ids=inputs.input_ids, + attention_mask=inputs.input_mask, + token_type_ids=inputs.segment_ids, + training=is_training) + logits = outputs[0] + logits = pretrain_utils.gather_positions( + logits, inputs.masked_lm_positions) + + oh_labels = tf.one_hot( + inputs.masked_lm_ids, depth=self.disc_config.vocab_size, + dtype=tf.float32) + + probs = tf.cast(tf.nn.softmax(logits), tf.float32) + log_probs = tf.cast(tf.nn.log_softmax(logits), tf.float32) + label_log_probs = -tf.reduce_sum(log_probs * oh_labels, axis=-1) + + numerator = tf.reduce_sum(masked_lm_weights * label_log_probs) + denominator = tf.reduce_sum(masked_lm_weights) + 1e-6 + loss = numerator / denominator + preds = tf.argmax(log_probs, axis=-1, output_type=tf.int32) + + MLMOutput = collections.namedtuple( + "MLMOutput", ["logits", "probs", "loss", "per_example_loss", "preds"]) + return MLMOutput( + logits=logits, probs=probs, per_example_loss=label_log_probs, + loss=loss, preds=preds) + + def _get_discriminator_output(self, inputs, discriminator, labels, is_training=False): + """Discriminator binary classifier.""" + + outputs = discriminator( + input_ids=inputs.input_ids, + attention_mask=inputs.input_mask, + token_type_ids=inputs.segment_ids, + training=is_training, + ) + logits = outputs[0] + weights = tf.cast(inputs.input_mask, tf.float32) + labelsf = tf.cast(labels, tf.float32) + logits = tf.cast(logits, tf.float32) + losses = tf.nn.sigmoid_cross_entropy_with_logits( + logits=logits, labels=labelsf) * weights + per_example_loss = (tf.reduce_sum(losses, axis=-1) / + (1e-6 + tf.reduce_sum(weights, axis=-1))) + loss = tf.reduce_sum(losses) / (1e-6 + tf.reduce_sum(weights)) + probs = tf.nn.sigmoid(logits) + preds = tf.cast(tf.round((tf.sign(logits) + 1) / 2), 
tf.int32) + DiscOutput = collections.namedtuple( + "DiscOutput", ["loss", "per_example_loss", "probs", "preds", + "labels"]) + return DiscOutput( + loss=loss, per_example_loss=per_example_loss, probs=probs, + preds=preds, labels=labels, + ) + + def _get_fake_data(self, inputs, mlm_logits): + """Sample from the generator to create corrupted input.""" + inputs = pretrain_utils.unmask(inputs) + disallow = tf.one_hot( + inputs.masked_lm_ids, depth=self.disc_config.vocab_size, + dtype=tf.float32) if self._config.disallow_correct else None + sampled_tokens = tf.stop_gradient(pretrain_utils.sample_from_softmax( + mlm_logits / self._config.temperature, disallow=disallow)) + sampled_tokids = tf.argmax(sampled_tokens, -1, output_type=tf.int32) + updated_input_ids, masked = pretrain_utils.scatter_update( + inputs.input_ids, sampled_tokids, inputs.masked_lm_positions) + labels = masked * (1 - tf.cast( + tf.equal(updated_input_ids, inputs.input_ids), tf.int32)) + updated_inputs = pretrain_utils.get_updated_inputs( + inputs, input_ids=updated_input_ids) + FakedData = collections.namedtuple("FakedData", [ + "inputs", "is_fake_tokens", "sampled_tokens"]) + return FakedData(inputs=updated_inputs, is_fake_tokens=labels, + sampled_tokens=sampled_tokens) + + +@add_start_docstrings( + """ +Electra model with a token classification head on top. + +Both the discriminator and generator may be loaded into this model.""", + ELECTRA_START_DOCSTRING, +) +class TFElectraForTokenClassification(TFElectraPreTrainedModel): + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + + self.electra = TFElectraMainLayer(config, name="electra") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier") + + @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): + r""" + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: + scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + import tensorflow as tf + from transformers import ElectraTokenizer, TFElectraForTokenClassification + + tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') + model = TFElectraForTokenClassification.from_pretrained('google/electra-small-discriminator') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + scores = outputs[0] + """ + + discriminator_hidden_states = self.electra( + input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, training=training + ) + discriminator_sequence_output = discriminator_hidden_states[0] + discriminator_sequence_output = self.dropout(discriminator_sequence_output) + logits = self.classifier(discriminator_sequence_output) + output = (logits,) + output += discriminator_hidden_states[1:] + + return output # (loss), scores, (hidden_states), (attentions) + + +class TFPoolerStartLogits(tf.keras.Model): + """ Compute SQuAD start_logits from sequence hidden states. """ + + def __init__(self, config, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + self.dense = tf.keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="start_logit_pooler_dense" + ) + + def call(self, hidden_states, p_mask=None, next_layer_dtype=tf.float32): + """ Args: + **p_mask**: (`optional`) ``torch.FloatTensor`` of shape `(batch_size, seq_len)` + invalid position mask such as query and special symbols (PAD, SEP, CLS) + 1.0 means token should be masked. + """ + x = tf.squeeze(self.dense(hidden_states), axis=-1, + name="squeeze_start_logit_pooler") + + if p_mask is not None: + x = tf.cast(x, tf.float32) * (1 - p_mask) - 1e30 * p_mask + + return x + + +class TFPoolerEndLogits(tf.keras.Model): + """ Compute SQuAD end_logits from sequence hidden states and start token hidden state. + """ + + def __init__(self, config, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + self.dense_0 = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), + name="end_logit_pooler_dense_0" + ) + + self.activation = tf.keras.layers.Activation('tanh') # nn.Tanh() + self.LayerNorm = tf.keras.layers.LayerNormalization(axis=-1, epsilon=config.layer_norm_eps, + name="end_logit_pooler_LayerNorm") + self.dense_1 = tf.keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="end_logit_pooler_dense_1" + ) + + def call(self, hidden_states, start_states=None, start_positions=None, p_mask=None, training=False, + next_layer_dtype=tf.float32): + """ Args: + One of ``start_states``, ``start_positions`` should be not None. + If both are set, ``start_positions`` overrides ``start_states``. + **start_states**: ``torch.LongTensor`` of shape identical to hidden_states + hidden states of the first tokens for the labeled span. + **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` + position of the first token for the labeled span: + **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)`` + Mask of invalid position such as query and special symbols (PAD, SEP, CLS) + 1.0 means token should be masked. 
+ """ + assert ( + start_states is not None or start_positions is not None + ), "One of start_states, start_positions should be not None" + if start_positions is not None and training: + bsz, slen, hsz = hidden_states.shape + start_states = tf.gather(hidden_states, start_positions[:, None], axis=1, + batch_dims=1) # shape (bsz, 1, hsz) + start_states = tf.broadcast_to(start_states, (bsz, slen, hsz)) # shape (bsz, slen, hsz) + + x = self.dense_0(tf.concat([hidden_states, start_states], axis=-1)) + x = self.activation(x) + if training: + # since we are not doing beam search, add dimension with value=1. corresponds to dimension with top_k during inference - if not layernorm crashes + x = tf.expand_dims(x, axis=2) + x = self.LayerNorm(x) + + if training: + # undo the additional dimension added above + x = tf.squeeze(self.dense_1(x), axis=[-1, -2]) + else: + x = tf.squeeze(self.dense_1(x), axis=-1) + + if p_mask is not None: + x = tf.cast(x, tf.float32) * (1 - p_mask) - 1e30 * p_mask + + return x + + +class TFPoolerAnswerClass(tf.keras.Model): + """ Compute SQuAD 2.0 answer class from classification and start tokens hidden states. """ + + def __init__(self, config, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + self.dense_0 = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), + name="pooler_answer_class_dense_0" + ) + + self.activation = tf.keras.layers.Activation('tanh') + self.dense_1 = tf.keras.layers.Dense( + 1, use_bias=False, kernel_initializer=get_initializer(config.initializer_range), + name="pooler_answer_class_dense_1" + ) + + def call(self, hidden_states, start_states=None, start_positions=None, cls_index=None): + """ + Args: + One of ``start_states``, ``start_positions`` should be not None. + If both are set, ``start_positions`` overrides ``start_states``. + **start_states**: ``torch.LongTensor`` of shape identical to ``hidden_states``. + hidden states of the first tokens for the labeled span. + **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` + position of the first token for the labeled span. + **cls_index**: torch.LongTensor of shape ``(batch_size,)`` + position of the CLS token. If None, take the last token. 
+ note(Original repo): + no dependency on end_feature so that we can obtain one single `cls_logits` + for each sample + """ + assert ( + start_states is not None or start_positions is not None + ), "One of start_states, start_positions should be not None" + if start_positions is not None: + start_states = tf.gather(hidden_states, start_positions[:, None], axis=1, + batch_dims=1) # shape (bsz, 1, hsz) + start_states = tf.squeeze(start_states, axis=1) # shape (bsz, hsz) + + if cls_index is not None: + cls_token_state = tf.gather(hidden_states, cls_index[:, None], axis=1, batch_dims=1) # shape (bsz, 1, hsz) + cls_token_state = tf.squeeze(cls_token_state, axis=1) # shape (bsz, hsz) + else: + cls_token_state = hidden_states[:, 0, :] # shape (bsz, hsz) + + x = self.dense_0(tf.concat([start_states, cls_token_state], axis=-1)) + x = self.activation(x) + x = tf.squeeze(self.dense_1(x), axis=-1) + + return x + + +class TFElectraForQuestionAnswering(TFElectraPreTrainedModel): + def __init__(self, config, args): + super().__init__(config, args) + + self.start_n_top = args.beam_size # config.start_n_top + self.end_n_top = args.beam_size # config.end_n_top + self.joint_head = args.joint_head + self.v2 = args.version_2_with_negative + self.electra = TFElectraMainLayer(config, name="electra") + self.num_hidden_layers = config.num_hidden_layers + self.amp = config.amp + + ##old head + if not self.joint_head: + self.qa_outputs = tf.keras.layers.Dense( + 2, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs") + else: + self.start_logits = TFPoolerStartLogits(config, name='start_logits') + self.end_logits = TFPoolerEndLogits(config, name='end_logits') + if self.v2: + self.answer_class = TFPoolerAnswerClass(config, name='answer_class') + + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + start_positions=None, + end_positions=None, + cls_index=None, + p_mask=None, + is_impossible=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): + outputs = self.electra( + input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, training=training + ) + discriminator_sequence_output = outputs[0] + + # Simple head model + if not self.joint_head: + logits = self.qa_outputs(discriminator_sequence_output) + [start_logits, end_logits] = tf.split(logits, 2, axis=-1) + start_logits = tf.squeeze(start_logits, axis=-1, name="squeeze_start_logit") + end_logits = tf.squeeze(end_logits, axis=-1, name="squeeze_end_logit") + outputs = (start_logits, end_logits) + outputs + return outputs + + start_logits = self.start_logits(discriminator_sequence_output, p_mask=p_mask, + next_layer_dtype=self.end_logits.dense_0.dtype) + if training: # start_positions is not None and end_positions is not None: + + # during training, compute the end logits based on the ground truth of the start position + end_logits = self.end_logits(discriminator_sequence_output, start_positions=start_positions, p_mask=p_mask, + training=training, + next_layer_dtype=tf.float16 if self.amp else tf.float32) + + if self.v2: # cls_index is not None:#cls_index is not None and is_impossible is not None: + # Predict answerability from the representation of CLS and START + cls_logits = self.answer_class(discriminator_sequence_output, start_positions=start_positions, + cls_index=cls_index) + + else: + cls_logits = None + + outputs = (start_logits, end_logits, cls_logits) + outputs + + else: + # during inference, compute the end logits based on beam search + 
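For the simple (non-joint) head used by ``TFElectraForQuestionAnswering`` when ``joint_head`` is off, a single ``Dense(2)`` projection yields a ``(batch, seq_len, 2)`` tensor that is split into per-token start and end logits. A minimal sketch of that split/squeeze, with a dummy tensor standing in for the projected discriminator output::

    import tensorflow as tf

    batch, seq_len = 2, 6
    # Stand-in for qa_outputs(discriminator_sequence_output); the last axis
    # holds a (start, end) score for every token.
    logits = tf.random.normal((batch, seq_len, 2))

    start_logits, end_logits = tf.split(logits, 2, axis=-1)  # two (batch, seq_len, 1) tensors
    start_logits = tf.squeeze(start_logits, axis=-1)         # (batch, seq_len)
    end_logits = tf.squeeze(end_logits, axis=-1)             # (batch, seq_len)

    print(start_logits.shape, end_logits.shape)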
bsz, slen, hsz = discriminator_sequence_output.shape + start_n_top = min(self.start_n_top, slen) + end_n_top = min(self.end_n_top, slen) + start_log_probs = tf.nn.log_softmax(start_logits, axis=-1, name="start_logit_softmax") # shape (bsz, slen) + + start_top_log_probs, start_top_index = tf.math.top_k(start_log_probs, k=start_n_top, + name="start_log_probs_top_k") + + start_states = tf.gather(discriminator_sequence_output, start_top_index, axis=1, + batch_dims=1) # shape (bsz, start_n_top, hsz) + start_states = tf.broadcast_to(tf.expand_dims(start_states, axis=1), + [bsz, slen, start_n_top, hsz]) # shape (bsz, slen, start_n_top, hsz) + + discriminator_sequence_output_expanded = tf.broadcast_to( + tf.expand_dims(discriminator_sequence_output, axis=2), + list(start_states.shape)) # shape (bsz, slen, start_n_top, hsz) + + p_mask = tf.expand_dims(p_mask, axis=-1) if p_mask is not None else None + end_logits = self.end_logits(discriminator_sequence_output_expanded, start_states=start_states, + p_mask=p_mask, next_layer_dtype=tf.float16 if self.amp else tf.float32) # self.answer_class.dense_0.dtype) + end_log_probs = tf.nn.log_softmax(end_logits, axis=1, + name="end_logit_softmax") # shape (bsz, slen, start_n_top) + + # need to transpose because tf.math.top_k works on default axis=-1 + end_log_probs = tf.transpose(end_log_probs, perm=[0, 2, 1]) + end_top_log_probs, end_top_index = tf.math.top_k( + end_log_probs, k=end_n_top) # shape (bsz, end_n_top, start_n_top).perm(0,2,1) + end_top_log_probs = tf.reshape(end_top_log_probs, ( + -1, start_n_top * end_n_top)) # shape (bsz, self.start_n_top * self.end_n_top) + end_top_index = tf.reshape(end_top_index, + (-1, start_n_top * end_n_top)) # shape (bsz, self.start_n_top * self.end_n_top) + if self.v2: # cls_index is not None: + start_p = tf.nn.softmax(start_logits, axis=-1, name="start_softmax") + start_states = tf.einsum( + "blh,bl->bh", discriminator_sequence_output, tf.cast(start_p, tf.float16) if self.amp else start_p + ) # get the representation of START as weighted sum of hidden states + # explicitly setting cls_index to None + cls_logits = self.answer_class( + discriminator_sequence_output, start_states=start_states, cls_index=None) + # one single `cls_logits` for each sample + else: + cls_logits = tf.fill([bsz], 0.0) + + outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs + + # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits + return outputs diff --git a/modelzoo/ELECTRA/modeling_utils.py b/modelzoo/ELECTRA/modeling_utils.py new file mode 100644 index 00000000..bfbc4cf4 --- /dev/null +++ b/modelzoo/ELECTRA/modeling_utils.py @@ -0,0 +1,2843 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
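In the joint-head inference branch that ends above, the flattened ``end_top_log_probs`` / ``end_top_index`` tensors have shape ``(bsz, start_n_top * end_n_top)``, where flat position ``j`` belongs to start candidate ``j // end_n_top``. A purely illustrative sketch of unpacking that layout into candidate spans, using made-up top-k indices rather than real model output::

    import tensorflow as tf

    start_n_top, end_n_top = 2, 3
    # Dummy outputs with the same layout the head produces:
    # start_top_index: (bsz, start_n_top); end_top_index: (bsz, start_n_top * end_n_top).
    start_top_index = tf.constant([[7, 12]])
    end_top_index = tf.constant([[9, 10, 8, 14, 13, 15]])

    for j in range(start_n_top * end_n_top):
        i = j // end_n_top  # which start candidate this end candidate pairs with
        start = int(start_top_index[0, i])
        end = int(end_top_index[0, j])
        print("candidate span:", start, end)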
+"""TF general model utils.""" +import functools +import logging +import os + +import h5py +import numpy as np +import tensorflow as tf +from tensorflow.python.keras.saving import hdf5_format + +from configuration_utils import PretrainedConfig, BertConfig +from file_utils import DUMMY_INPUTS, TF2_WEIGHTS_NAME, WEIGHTS_NAME, cached_path, hf_bucket_url, is_remote_url +from file_utils import MULTIPLE_CHOICE_DUMMY_INPUTS, add_start_docstrings, add_start_docstrings_to_callable +from tokenization_utils import BatchEncoding +from utils import log + + +class TFModelUtilsMixin: + """ + A few utilities for `tf.keras.Model`s, to be used as a mixin. + """ + + def num_parameters(self, only_trainable: bool = False) -> int: + """ + Get number of (optionally, trainable) parameters in the model. + """ + if only_trainable: + return int(sum(np.prod(w.shape.as_list()) for w in self.trainable_variables)) + else: + return self.count_params() + + +def keras_serializable(cls): + """ + Decorate a Keras Layer class to support Keras serialization. + + This is done by: + 1. adding a `transformers_config` dict to the Keras config dictionary in `get_config` (called by Keras at + serialization time + 2. wrapping `__init__` to accept that `transformers_config` dict (passed by Keras at deserialization time) and + convert it to a config object for the actual layer initializer + 3. registering the class as a custom object in Keras (if the Tensorflow version supports this), so that it does + not need to be supplied in `custom_objects` in the call to `tf.keras.models.load_model` + + :param cls: a tf.keras.layers.Layers subclass that accepts a `config` argument to its initializer (typically a + `TF*MainLayer` class in this project) + :return: the same class object, with modifications for Keras deserialization. + """ + initializer = cls.__init__ + + config_class = getattr(cls, "config_class", None) + if config_class is None: + raise AttributeError("Must set `config_class` to use @keras_serializable") + + @functools.wraps(initializer) + def wrapped_init(self, *args, **kwargs): + transformers_config = kwargs.pop("transformers_config", None) + config = args[0] if args and isinstance(args[0], PretrainedConfig) else kwargs.get("config", None) + if config is not None and transformers_config is not None: + raise ValueError("Must pass either `config` or `transformers_config`, not both") + elif config is not None: + # normal layer construction, call with unchanged args (config is already in there) + initializer(self, *args, **kwargs) + elif transformers_config is not None: + # Keras deserialization, convert dict to config + config = config_class.from_dict(transformers_config) + initializer(self, config, *args, **kwargs) + else: + raise ValueError("Must pass either `config` (PretrainedConfig) or `transformers_config` (dict)") + self._transformers_config = config + + cls.__init__ = wrapped_init + + if not hasattr(cls, "get_config"): + raise TypeError("Only use @keras_serializable on tf.keras.layers.Layer subclasses") + if hasattr(cls.get_config, "_is_default"): + + def get_config(self): + cfg = super(cls, self).get_config() + cfg["transformers_config"] = self._transformers_config.to_dict() + return cfg + + cls.get_config = get_config + + cls._keras_serializable = True + if hasattr(tf.keras.utils, "register_keras_serializable"): + cls = tf.keras.utils.register_keras_serializable()(cls) + return cls + + +class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin): + r""" Base class for all TF models. 
+ + :class:`~transformers.TFPreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models + as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads. + + Class attributes (overridden by derived classes): + - ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture. + - ``pretrained_model_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained weights as values. + - ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments: + + - ``model``: an instance of the relevant subclass of :class:`~transformers.PreTrainedModel`, + - ``config``: an instance of the relevant subclass of :class:`~transformers.PretrainedConfig`, + - ``path``: a path (string) to the TensorFlow checkpoint. + + - ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model. + """ + config_class = None + pretrained_model_archive_map = {} + base_model_prefix = "" + + @property + def dummy_inputs(self): + """ Dummy inputs to build the network. + + Returns: + tf.Tensor with dummy inputs + """ + return {"input_ids": tf.constant(DUMMY_INPUTS)} + + def __init__(self, config, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + if not isinstance(config, PretrainedConfig): + raise ValueError( + "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. " + "To create a model from a pretrained model use " + "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( + self.__class__.__name__, self.__class__.__name__ + ) + ) + # Save config in model + self.config = config + + def get_input_embeddings(self): + """ + Returns the model's input embeddings. + + Returns: + :obj:`tf.keras.layers.Layer`: + A torch module mapping vocabulary to hidden states. + """ + base_model = getattr(self, self.base_model_prefix, self) + if base_model is not self: + return base_model.get_input_embeddings() + else: + raise NotImplementedError + + def get_output_embeddings(self): + """ + Returns the model's output embeddings. + + Returns: + :obj:`tf.keras.layers.Layer`: + A torch module mapping hidden states to vocabulary. + """ + return None # Overwrite for models with output embeddings + + def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None): + """ Build a resized Embedding Variable from a provided token Embedding Module. + Increasing the size will add newly initialized vectors at the end + Reducing the size will remove vectors from the end + + Args: + new_num_tokens: (`optional`) int + New number of tokens in the embedding matrix. + Increasing the size will add newly initialized vectors at the end + Reducing the size will remove vectors from the end + If not provided or None: return the provided token Embedding Module. 
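The resize logic sketched in the commented-out body below keeps the first ``min(old, new)`` rows of the old matrix and leaves any newly added rows at a fresh initialization. Since the patch ships that body commented out, the following is only an illustrative TensorFlow version of the same idea with plain variables (an assumption about intent, not the method the patch provides)::

    import tensorflow as tf

    def resized_embeddings_sketch(old_embeddings, new_num_tokens):
        # Illustrative only: grow or shrink an embedding matrix, copying existing rows.
        old_num_tokens, embedding_dim = old_embeddings.shape
        if new_num_tokens is None or new_num_tokens == old_num_tokens:
            return old_embeddings
        # Fresh matrix; rows beyond the copied range keep their new initialization.
        new_values = tf.random.truncated_normal((new_num_tokens, embedding_dim), stddev=0.02)
        num_to_copy = min(old_num_tokens, new_num_tokens)
        new_values = tf.concat([old_embeddings[:num_to_copy], new_values[num_to_copy:]], axis=0)
        return tf.Variable(new_values)

    old = tf.Variable(tf.random.normal((10, 4)))
    print(resized_embeddings_sketch(old, 12).shape)  # (12, 4)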
+ Return: ``tf.Variable`` + Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None + """ + # if new_num_tokens is None: + # return old_embeddings + + # old_num_tokens, old_embedding_dim = old_embeddings.weight.size() + # if old_num_tokens == new_num_tokens: + # return old_embeddings + + # # Build new embeddings + # new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim) + # new_embeddings.to(old_embeddings.weight.device) + + # # initialize all new embeddings (in particular added tokens) + # self._init_weights(new_embeddings) + + # # Copy token embeddings from the previous weights + # num_tokens_to_copy = min(old_num_tokens, new_num_tokens) + # new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :] + + # return new_embeddings + + def resize_token_embeddings(self, new_num_tokens=None): + """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size. + Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method. + + Arguments: + + new_num_tokens: (`optional`) int: + New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end. + If not provided or None: does nothing and just returns a pointer to the input tokens ``tf.Variable`` Module of the model. + + Return: ``tf.Variable`` + Pointer to the input tokens Embeddings Module of the model + """ + raise NotImplementedError + + def prune_heads(self, heads_to_prune): + """ Prunes heads of the base model. + + Arguments: + + heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`). + """ + raise NotImplementedError + + def save_pretrained(self, save_directory): + """ Save a model and its configuration file to a directory, so that it + can be re-loaded using the :func:`~transformers.PreTrainedModel.from_pretrained` class method. + """ + if os.path.isfile(save_directory): + log("Provided path ({}) should be a directory, not a file".format(save_directory)) + return + os.makedirs(save_directory, exist_ok=True) + + # Save configuration file + self.config.save_pretrained(save_directory) + + # If we save using the predefined names, we can load using `from_pretrained` + output_model_file = os.path.join(save_directory, TF2_WEIGHTS_NAME) + self.save_weights(output_model_file) + + with h5py.File(output_model_file, "r") as f: + if "layer_names" not in f.attrs and "model_weights" in f: + f = f["model_weights"] + hdf5_layer_names = set(hdf5_format.load_attributes_from_hdf5_group(f, "layer_names")) + log(f"Model weights saved in {output_model_file}: {hdf5_layer_names}") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r"""Instantiate a pretrained TF 2.0 model from a pre-trained model configuration. + + The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model. + It is up to you to train those weights with a downstream fine-tuning task. + + The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded. + + Parameters: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. 
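``save_pretrained`` above writes the configuration file plus an HDF5 weights file under the library's predefined name into a directory, which is exactly the layout ``from_pretrained`` can load back. A round-trip sketch in the style of the Examples blocks in this file (for example purposes, not runnable without network access or a trained model)::

    # For example purposes. Not runnable.
    model = TFElectraForTokenClassification.from_pretrained('google/electra-small-discriminator')
    model.save_pretrained('./my_model_directory/')  # writes config.json and the TF 2.0 weights file
    reloaded = TFElectraForTokenClassification.from_pretrained('./my_model_directory/')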
+ - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. + - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `PyTorch state_dict save file` (e.g. `./pt_model/pytorch_model.bin`). In this case, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the PyTorch checkpoint in a TensorFlow model using the provided conversion scripts and loading the TensorFlow model afterwards. + + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + + config: (`optional`) one of: + - an instance of a class derived from :class:`~transformers.PretrainedConfig`, or + - a string valid as input to :func:`~transformers.PretrainedConfig.from_pretrained()` + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + from_pt: (`optional`) boolean, default False: + Load the model weights from a PyTorch state_dict save file (see docstring of pretrained_model_name_or_path argument). + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + resume_download: (`optional`) boolean, default False: + Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. + + kwargs: (`optional`) Remaining dictionary of keyword arguments: + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. 
Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + + Examples:: + + # For example purposes. Not runnable. + model = BertModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. + model = BertModel.from_pretrained('./test/saved_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = BertModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json') + model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_pt=True, config=config) + + """ + config = kwargs.pop("config", None) + cache_dir = kwargs.pop("cache_dir", None) + from_pt = kwargs.pop("from_pt", False) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + output_loading_info = kwargs.pop("output_loading_info", False) + + # Load config if we don't provide a configuration + if not isinstance(config, PretrainedConfig): + config_path = config if config is not None else pretrained_model_name_or_path + config, model_kwargs = cls.config_class.from_pretrained( + config_path, + *model_args, + cache_dir=cache_dir, + return_unused_kwargs=True, + force_download=force_download, + resume_download=resume_download, + **kwargs, + ) + else: + model_kwargs = kwargs + + # Load model + if pretrained_model_name_or_path is not None: + if pretrained_model_name_or_path in cls.pretrained_model_archive_map: + archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path] + elif os.path.isdir(pretrained_model_name_or_path): + if os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)): + # Load from a TF 2.0 checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME) + elif from_pt and os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)): + # Load from a PyTorch checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) + else: + raise EnvironmentError( + "Error no file named {} found in directory {} or `from_pt` set to False".format( + [WEIGHTS_NAME, TF2_WEIGHTS_NAME], pretrained_model_name_or_path + ) + ) + elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): + archive_file = pretrained_model_name_or_path + elif os.path.isfile(pretrained_model_name_or_path + ".index"): + archive_file = pretrained_model_name_or_path + ".index" + else: + archive_file = hf_bucket_url( + pretrained_model_name_or_path, postfix=(WEIGHTS_NAME if from_pt else TF2_WEIGHTS_NAME) + ) + + # redirect to the cache, if necessary + try: + resolved_archive_file = cached_path( + archive_file, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + ) + except EnvironmentError as e: + if pretrained_model_name_or_path in cls.pretrained_model_archive_map: + log("Couldn't reach server at '{}' to download pretrained weights.".format(archive_file)) + else: + log( + "Model name '{}' was not found in model name list ({}). 
" + "We assumed '{}' was a path or url but couldn't find any file " + "associated to this path or url.".format( + pretrained_model_name_or_path, + ", ".join(cls.pretrained_model_archive_map.keys()), + archive_file, + ) + ) + raise e + if resolved_archive_file == archive_file: + log("loading weights file {}".format(archive_file)) + else: + log("loading weights file {} from cache at {}".format(archive_file, resolved_archive_file)) + else: + resolved_archive_file = None + + # Instantiate model. + model = cls(config, *model_args, **model_kwargs) + + if from_pt: + # Load from a PyTorch checkpoint + raise NotImplementedError + # return load_pytorch_checkpoint_in_tf2_model(model, resolved_archive_file, allow_missing_keys=True) + + model(model.dummy_inputs, training=False) # build the network with dummy inputs + + assert os.path.isfile(resolved_archive_file), "Error retrieving file {}".format(resolved_archive_file) + # 'by_name' allow us to do transfer learning by skipping/adding layers + # see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1339-L1357 + try: + model.load_weights(resolved_archive_file, by_name=True) + except OSError: + raise OSError( + "Unable to load weights from h5 file. " + "If you tried to load a TF 2.0 model from a PyTorch checkpoint, please set from_pt=True. " + ) + + model(model.dummy_inputs, training=False) # Make sure restore ops are run + + # Check if the models are the same to output loading information + with h5py.File(resolved_archive_file, "r") as f: + if "layer_names" not in f.attrs and "model_weights" in f: + f = f["model_weights"] + hdf5_layer_names = set(hdf5_format.load_attributes_from_hdf5_group(f, "layer_names")) + model_layer_names = set(layer.name for layer in model.layers) + missing_keys = list(model_layer_names - hdf5_layer_names) + unexpected_keys = list(hdf5_layer_names - model_layer_names) + error_msgs = [] + + if len(unexpected_keys) > 0: + log( + f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when " + f"initializing {model.__class__.__name__}: {unexpected_keys}\n" + ) + else: + log(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n") + if len(missing_keys) > 0: + log( + f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at {pretrained_model_name_or_path} " + f"and are newly initialized: {missing_keys}\n" + ) + else: + log( + f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at {pretrained_model_name_or_path}.\n" + f"If your task is similar to the task the model of the ckeckpoint was trained on, " + f"you can already use {model.__class__.__name__} for predictions without further training." 
+ ) + if len(error_msgs) > 0: + raise RuntimeError( + "Error(s) in loading weights for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs)) + ) + if output_loading_info: + loading_info = {"missing_keys": missing_keys, "unexpected_keys": unexpected_keys, "error_msgs": error_msgs} + return model, loading_info + + return model + + def prepare_inputs_for_generation(self, inputs, **kwargs): + return {"inputs": inputs} + + def _do_output_past(self, outputs): + has_output_past = hasattr(self.config, "output_past") and self.config.output_past + has_mem_len = hasattr(self.config, "mem_len") and self.config.mem_len + + if has_output_past and not has_mem_len and len(outputs) > 1: + return True + elif has_mem_len and self.config.mem_len > 0 and len(outputs) > 1: + return True + + return False + + def generate( + self, + input_ids=None, + max_length=None, + min_length=None, + do_sample=None, + early_stopping=None, + num_beams=None, + temperature=None, + top_k=None, + top_p=None, + repetition_penalty=None, + bad_words_ids=None, + bos_token_id=None, + pad_token_id=None, + eos_token_id=None, + length_penalty=None, + no_repeat_ngram_size=None, + num_return_sequences=None, + attention_mask=None, + decoder_start_token_id=None, + ): + r""" Generates sequences for models with a LM head. The method currently supports greedy or penalized greedy decoding, sampling with top-k or nucleus sampling + and beam-search. + + Adapted in part from `Facebook's XLM beam search code`_. + + .. _`Facebook's XLM beam search code`: + https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529 + + + Parameters: + + input_ids: (`optional`) `tf.Tensor` of `dtype=tf.int32` of shape `(batch_size, sequence_length)` + The sequence used as a prompt for the generation. If `None` the method initializes + it as an empty `torch.LongTensor` of shape `(1,)`. + + max_length: (`optional`) int + The max length of the sequence to be generated. Between 1 and infinity. Default to 20. + + min_length: (`optional`) int + The min length of the sequence to be generated. Between 0 and infinity. Default to 0. + do_sample: (`optional`) bool + If set to `False` greedy decoding is used. Otherwise sampling is used. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. + + early_stopping: (`optional`) bool + if set to `True` beam search is stopped when at least `num_beams` sentences finished per batch. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`. + + num_beams: (`optional`) int + Number of beams for beam search. Must be between 1 and infinity. 1 means no beam search. Default to 1. + + temperature: (`optional`) float + The value used to module the next token probabilities. Must be strictely positive. Default to 1.0. + + top_k: (`optional`) int + The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50. + + top_p: (`optional`) float + The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1. + + repetition_penalty: (`optional`) float + The parameter for repetition penalty. Between 1.0 and infinity. 1.0 means no penalty. Default to 1.0. + + bos_token_id: (`optional`) int + Beginning of sentence token if no prompt is provided. Default to specicic model bos_token_id or None if it does not exist. + + pad_token_id: (`optional`) int + Pad token. Defaults to pad_token_id as defined in the models config. 
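When ``output_loading_info=True`` is passed, ``from_pretrained`` above additionally returns the dictionary of ``missing_keys``, ``unexpected_keys`` and ``error_msgs`` collected while matching checkpoint layers to model layers. A sketch of inspecting it, mirroring the earlier Examples blocks (for example purposes, not runnable)::

    # For example purposes. Not runnable.
    model, loading_info = TFElectraForTokenClassification.from_pretrained(
        'google/electra-small-discriminator', output_loading_info=True
    )
    print(loading_info['missing_keys'])     # layers newly initialized for this head
    print(loading_info['unexpected_keys'])  # checkpoint layers this model did not use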
+ + eos_token_id: (`optional`) int + EOS token. Defaults to eos_token_id as defined in the models config. + + length_penalty: (`optional`) float + Exponential penalty to the length. Default to 1. + + no_repeat_ngram_size: (`optional`) int + If set to int > 0, all ngrams of size `no_repeat_ngram_size` can only occur once. + + bad_words_ids: (`optional`) list of lists of int + `bad_words_ids` contains tokens that are not allowed to be generated. In order to get the tokens of the words that should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`. + + num_return_sequences: (`optional`) int + The number of independently computed returned sequences for each element in the batch. Default to 1. + + attention_mask (`optional`) obj: `tf.Tensor` with `dtype=tf.int32` of same shape as `input_ids` + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + Defaults to `None`. + + `What are attention masks? <../glossary.html#attention-mask>`__ + + decoder_start_token_id=None: (`optional`) int + If an encoder-decoder model starts decoding with a different token than BOS. + Defaults to `None` and is changed to `BOS` later. + + Return: + + output: `tf.Tensor` of `dtype=tf.int32` shape `(batch_size * num_return_sequences, sequence_length)` + sequence_length is either equal to max_length or shorter if all batches finished early due to the `eos_token_id` + + Examples:: + + tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. + outputs = model.generate(max_length=40) # do greedy decoding + print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) + + tokenizer = AutoTokenizer.from_pretrained('openai-gpt') # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache. + input_context = 'The dog' + input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context + outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog' + for i in range(3): # 3 output sequences were generated + print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) + + tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. + input_context = 'The dog' + input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context + outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3) # 3 generate sequences using by sampling + for i in range(3): # 3 output sequences were generated + print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) + + tokenizer = AutoTokenizer.from_pretrained('ctrl') # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained('ctrl') # Download model and configuration from S3 and cache. 
+ input_context = 'Legal My neighbor is' # "Legal" is one of the control codes for ctrl + input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context + outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences + print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) + + tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained('gpt2') # Download model and configuration from S3 and cache. + input_context = 'My cute dog' # "Legal" is one of the control codes for ctrl + bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']] + input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context + outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids) # generate sequences without allowing bad_words to be generated + """ + + # We cannot generate if the model does not have a LM head + if self.get_output_embeddings() is None: + raise AttributeError( + "You tried to generate sequences with a model that does not have a LM Head." + "Please use another model class (e.g. `TFOpenAIGPTLMHeadModel`, `TFXLNetLMHeadModel`, `TFGPT2LMHeadModel`, `TFCTRLLMHeadModel`, `TFT5ForConditionalGeneration`, `TFTransfoXLLMHeadModel`)" + ) + + max_length = max_length if max_length is not None else self.config.max_length + min_length = min_length if min_length is not None else self.config.min_length + do_sample = do_sample if do_sample is not None else self.config.do_sample + early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping + num_beams = num_beams if num_beams is not None else self.config.num_beams + temperature = temperature if temperature is not None else self.config.temperature + top_k = top_k if top_k is not None else self.config.top_k + top_p = top_p if top_p is not None else self.config.top_p + repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty + bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty + no_repeat_ngram_size = ( + no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size + ) + bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids + num_return_sequences = ( + num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences + ) + decoder_start_token_id = ( + decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id + ) + + if input_ids is not None: + batch_size = shape_list(input_ids)[0] # overriden by the input batch_size + else: + batch_size = 1 + + assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictely positive integer." + assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer." + assert isinstance(do_sample, bool), "`do_sample` should be a boolean." + assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean." 
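Every generation hyperparameter in the block above is resolved with the same ``value if value is not None else config default`` pattern, so explicit call arguments win and anything omitted falls back to the model configuration. A tiny stand-alone illustration of that pattern (the ``_Cfg`` class is a made-up stand-in, not part of the patch)::

    # Illustrative only: caller arguments override, None falls back to the config.
    class _Cfg:
        max_length = 20

    config = _Cfg()

    def resolve(value, default):
        return value if value is not None else default

    print(resolve(None, config.max_length))  # 20 -> config default used
    print(resolve(64, config.max_length))    # 64 -> explicit argument wins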
+ assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictely positive integer." + assert temperature > 0, "`temperature` should be strictely positive." + assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer." + assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." + assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." + assert input_ids is not None or ( + isinstance(bos_token_id, int) and bos_token_id >= 0 + ), "If input_ids is not defined, `bos_token_id` should be a positive integer." + assert pad_token_id is None or ( + isinstance(pad_token_id, int) and (pad_token_id >= 0) + ), "`pad_token_id` should be a positive integer." + assert (eos_token_id is None) or ( + isinstance(eos_token_id, int) and (eos_token_id >= 0) + ), "`eos_token_id` should be a positive integer." + assert length_penalty > 0, "`length_penalty` should be strictely positive." + assert ( + isinstance(num_return_sequences, int) and num_return_sequences > 0 + ), "`num_return_sequences` should be a strictely positive integer." + assert ( + bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list) + ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" + + if input_ids is None: + assert isinstance(bos_token_id, int) and bos_token_id >= 0, ( + "you should either supply a context to complete as `input_ids` input " + "or a `bos_token_id` (integer >= 0) as a first token to start the generation." + ) + input_ids = tf.fill((batch_size, 1), bos_token_id) + else: + assert len(shape_list(input_ids)) == 2, "Input prompt should be of shape (batch_size, sequence length)." + + # not allow to duplicate outputs when greedy decoding + if do_sample is False: + if num_beams == 1: + # no_beam_search greedy generation conditions + assert ( + num_return_sequences == 1 + ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1" + + else: + # beam_search greedy generation conditions + assert ( + num_beams >= num_return_sequences + ), "Greedy beam search decoding cannot return more sequences than it has beams. 
Please set num_beams >= num_return_sequences" + + # create attention mask if necessary + # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140 + if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids.numpy()): + attention_mask = tf.cast(tf.math.not_equal(input_ids, pad_token_id), dtype=tf.int32) + elif attention_mask is None: + attention_mask = tf.ones_like(input_ids) + + if pad_token_id is None and eos_token_id is not None: + log( + "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format(eos_token_id) + ) + pad_token_id = eos_token_id + + # current position and vocab size + cur_len = shape_list(input_ids)[1] + vocab_size = self.config.vocab_size + + # set effective batch size and effective batch multiplier according to do_sample + if do_sample: + effective_batch_size = batch_size * num_return_sequences + effective_batch_mult = num_return_sequences + else: + effective_batch_size = batch_size + effective_batch_mult = 1 + + # Expand input ids if num_beams > 1 or num_return_sequences > 1 + if num_return_sequences > 1 or num_beams > 1: + input_ids_len = shape_list(input_ids)[-1] + input_ids = tf.broadcast_to( + tf.expand_dims(input_ids, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len) + ) + attention_mask = tf.broadcast_to( + tf.expand_dims(attention_mask, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len) + ) + input_ids = tf.reshape( + input_ids, (effective_batch_size * num_beams, input_ids_len) + ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) + attention_mask = tf.reshape( + attention_mask, (effective_batch_size * num_beams, input_ids_len) + ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) + + if self.config.is_encoder_decoder: + if decoder_start_token_id is None: + decoder_start_token_id = bos_token_id + + assert ( + decoder_start_token_id is not None + ), "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation" + assert hasattr(self, "get_encoder"), "{} should have a 'get_encoder' function defined".format(self) + assert callable(self.get_encoder), "{} should be a method".format(self.get_encoder) + + # get encoder and store encoder outputs + encoder = self.get_encoder() + + encoder_outputs = encoder(input_ids, attention_mask=attention_mask) + + # create empty decoder_input_ids + input_ids = tf.ones((effective_batch_size * num_beams, 1), dtype=tf.int32,) * decoder_start_token_id + cur_len = 1 + + else: + encoder_outputs = None + cur_len = shape_list(input_ids)[-1] + + if num_beams > 1: + output = self._generate_beam_search( + input_ids, + cur_len=cur_len, + max_length=max_length, + min_length=min_length, + do_sample=do_sample, + early_stopping=early_stopping, + temperature=temperature, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + bad_words_ids=bad_words_ids, + bos_token_id=bos_token_id, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + decoder_start_token_id=decoder_start_token_id, + batch_size=effective_batch_size, + num_return_sequences=num_return_sequences, + length_penalty=length_penalty, + num_beams=num_beams, + vocab_size=vocab_size, + encoder_outputs=encoder_outputs, + attention_mask=attention_mask, + ) + else: + output = self._generate_no_beam_search( + input_ids, + cur_len=cur_len, + max_length=max_length, + min_length=min_length, + do_sample=do_sample, + temperature=temperature, + 
top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + bad_words_ids=bad_words_ids, + bos_token_id=bos_token_id, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + decoder_start_token_id=decoder_start_token_id, + batch_size=effective_batch_size, + vocab_size=vocab_size, + encoder_outputs=encoder_outputs, + attention_mask=attention_mask, + ) + + return output + + def _generate_no_beam_search( + self, + input_ids, + cur_len, + max_length, + min_length, + do_sample, + temperature, + top_k, + top_p, + repetition_penalty, + no_repeat_ngram_size, + bad_words_ids, + bos_token_id, + pad_token_id, + eos_token_id, + decoder_start_token_id, + batch_size, + vocab_size, + encoder_outputs, + attention_mask, + ): + """ Generate sequences for each example without beam search (num_beams == 1). + All returned sequence are generated independantly. + """ + + # length of generated sentences / unfinished sentences + unfinished_sents = tf.ones_like(input_ids[:, 0]) + sent_lengths = tf.ones_like(input_ids[:, 0]) * max_length + + past = encoder_outputs # defined for encoder-decoder models, None for decoder-only models + + while cur_len < max_length: + model_inputs = self.prepare_inputs_for_generation(input_ids, past=past, attention_mask=attention_mask) + outputs = self(**model_inputs) + next_token_logits = outputs[0][:, -1, :] + + # if model has past, then set the past variable to speed up decoding + if self._do_output_past(outputs): + past = outputs[1] + + # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) + if repetition_penalty != 1.0: + next_token_logits_penalties = _create_next_token_logits_penalties( + input_ids, next_token_logits, repetition_penalty + ) + next_token_logits = tf.math.multiply(next_token_logits, next_token_logits_penalties) + + if no_repeat_ngram_size > 0: + # calculate a list of banned tokens to prevent repetitively generating the same ngrams + # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 + banned_tokens = calc_banned_ngram_tokens(input_ids, batch_size, no_repeat_ngram_size, cur_len) + # create banned_tokens boolean mask + banned_tokens_indices_mask = [] + for banned_tokens_slice in banned_tokens: + banned_tokens_indices_mask.append( + [True if token in banned_tokens_slice else False for token in range(vocab_size)] + ) + + next_token_logits = set_tensor_by_indices_to_value( + next_token_logits, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") + ) + + if bad_words_ids is not None: + # calculate a list of banned tokens according to bad words + banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) + + banned_tokens_indices_mask = [] + for banned_tokens_slice in banned_tokens: + banned_tokens_indices_mask.append( + [True if token in banned_tokens_slice else False for token in range(vocab_size)] + ) + + next_token_logits = set_tensor_by_indices_to_value( + next_token_logits, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") + ) + + # set eos token prob to zero if min_length is not reached + if eos_token_id is not None and cur_len < min_length: + # create eos_token_id boolean mask + is_token_logit_eos_token = tf.convert_to_tensor( + [True if token is eos_token_id else False for token in range(vocab_size)], dtype=tf.bool + ) + eos_token_indices_mask = tf.broadcast_to(is_token_logit_eos_token, [batch_size, vocab_size]) + + next_token_logits = 
set_tensor_by_indices_to_value( + next_token_logits, eos_token_indices_mask, -float("inf") + ) + + if do_sample: + # Temperature (higher temperature => more likely to sample low probability tokens) + if temperature != 1.0: + next_token_logits = next_token_logits / temperature + # Top-p/top-k filtering + next_token_logits = tf_top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p) + # Sample + next_token = tf.squeeze( + tf.random.categorical(next_token_logits, dtype=tf.int32, num_samples=1), axis=1 + ) + else: + # Greedy decoding + next_token = tf.math.argmax(next_token_logits, axis=-1, output_type=tf.int32) + + # update generations and finished sentences + if eos_token_id is not None: + # pad finished sentences if eos_token_id exist + tokens_to_add = next_token * unfinished_sents + (pad_token_id) * (1 - unfinished_sents) + else: + tokens_to_add = next_token + + input_ids = tf.concat([input_ids, tf.expand_dims(tokens_to_add, -1)], 1) + + if eos_token_id is not None: + eos_in_sents = tokens_to_add == eos_token_id + # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length + is_sents_unfinished_and_token_to_add_is_eos = tf.math.multiply( + unfinished_sents, tf.cast(eos_in_sents, tf.int32) + ) + sent_lengths = ( + sent_lengths * (1 - is_sents_unfinished_and_token_to_add_is_eos) + + cur_len * is_sents_unfinished_and_token_to_add_is_eos + ) + + # unfinished_sents is set to zero if eos in sentence + unfinished_sents -= is_sents_unfinished_and_token_to_add_is_eos + + # stop when there is a in each sentence, or if we exceed the maximul length + if tf.math.reduce_max(unfinished_sents) == 0: + break + + # extend attention_mask for new generated input if only decoder + if self.config.is_encoder_decoder is False: + attention_mask = tf.concat( + [attention_mask, tf.ones((shape_list(attention_mask)[0], 1), dtype=tf.int32)], axis=-1 + ) + + cur_len = cur_len + 1 + + # if there are different sentences lengths in the batch, some batches have to be padded + min_sent_length = tf.math.reduce_min(sent_lengths) + max_sent_length = tf.math.reduce_max(sent_lengths) + if min_sent_length != max_sent_length: + assert pad_token_id is not None, "`Pad_token_id` has to be defined if batches have different lengths" + # finished sents are filled with pad_token + padding = tf.ones([batch_size, max_sent_length.numpy()], dtype=tf.int32) * pad_token_id + + # create length masks for tf.where operation + broad_casted_sent_lengths = tf.broadcast_to( + tf.expand_dims(sent_lengths, -1), [batch_size, max_sent_length] + ) + broad_casted_range = tf.transpose( + tf.broadcast_to(tf.expand_dims(tf.range(max_length), -1), [max_length, batch_size]) + ) + + decoded = tf.where(broad_casted_range < broad_casted_sent_lengths, input_ids, padding) + else: + decoded = input_ids + + return decoded + + def _generate_beam_search( + self, + input_ids, + cur_len, + max_length, + min_length, + do_sample, + early_stopping, + temperature, + top_k, + top_p, + repetition_penalty, + no_repeat_ngram_size, + bad_words_ids, + bos_token_id, + pad_token_id, + decoder_start_token_id, + eos_token_id, + batch_size, + num_return_sequences, + length_penalty, + num_beams, + vocab_size, + encoder_outputs, + attention_mask, + ): + """ Generate sequences for each example with beam search. 
+ """ + + # generated hypotheses + generated_hyps = [ + BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=early_stopping) + for _ in range(batch_size) + ] + + # for greedy decoding it is made sure that only tokens of the first beam are considered to avoid sampling the exact same tokens three times + if do_sample is False: + beam_scores_begin = tf.zeros((batch_size, 1), dtype=tf.float32) + beam_scores_end = tf.ones((batch_size, num_beams - 1), dtype=tf.float32) * (-1e9) + beam_scores = tf.concat([beam_scores_begin, beam_scores_end], -1) + else: + beam_scores = tf.zeros((batch_size, num_beams), dtype=tf.float32) + + beam_scores = tf.reshape(beam_scores, (batch_size * num_beams,)) + + # cache compute states + past = encoder_outputs + + # done sentences + done = [False for _ in range(batch_size)] + + while cur_len < max_length: + model_inputs = self.prepare_inputs_for_generation(input_ids, past=past, attention_mask=attention_mask) + outputs = self(**model_inputs) # (batch_size * num_beams, cur_len, vocab_size) + next_token_logits = outputs[0][:, -1, :] # (batch_size * num_beams, vocab_size) + + # if model has past, then set the past variable to speed up decoding + if self._do_output_past(outputs): + past = outputs[1] + + # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858) + if repetition_penalty != 1.0: + next_token_logits_penalties = _create_next_token_logits_penalties( + input_ids, next_token_logits, repetition_penalty + ) + next_token_logits = tf.math.multiply(next_token_logits, next_token_logits_penalties) + + # Temperature (higher temperature => more likely to sample low probability tokens) + if temperature != 1.0: + next_token_logits = next_token_logits / temperature + + # calculate log softmax score + scores = tf.nn.log_softmax(next_token_logits, axis=-1) # (batch_size * num_beams, vocab_size) + + # set eos token prob to zero if min_length is not reached + if eos_token_id is not None and cur_len < min_length: + # create eos_token_id boolean mask + num_batch_hypotheses = batch_size * num_beams + + is_token_logit_eos_token = tf.convert_to_tensor( + [True if token is eos_token_id else False for token in range(vocab_size)], dtype=tf.bool + ) + eos_token_indices_mask = tf.broadcast_to(is_token_logit_eos_token, [num_batch_hypotheses, vocab_size]) + + scores = set_tensor_by_indices_to_value(scores, eos_token_indices_mask, -float("inf")) + + if no_repeat_ngram_size > 0: + # calculate a list of banned tokens to prevent repetitively generating the same ngrams + # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 + num_batch_hypotheses = batch_size * num_beams + banned_tokens = calc_banned_ngram_tokens( + input_ids, num_batch_hypotheses, no_repeat_ngram_size, cur_len + ) + # create banned_tokens boolean mask + banned_tokens_indices_mask = [] + for banned_tokens_slice in banned_tokens: + banned_tokens_indices_mask.append( + [True if token in banned_tokens_slice else False for token in range(vocab_size)] + ) + + scores = set_tensor_by_indices_to_value( + scores, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") + ) + + if bad_words_ids is not None: + # calculate a list of banned tokens according to bad words + banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) + + banned_tokens_indices_mask = [] + for banned_tokens_slice in banned_tokens: + banned_tokens_indices_mask.append( + [True if token in banned_tokens_slice else False for token in 
range(vocab_size)] + ) + + scores = set_tensor_by_indices_to_value( + scores, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") + ) + + assert shape_list(scores) == [batch_size * num_beams, vocab_size] + + if do_sample: + _scores = scores + tf.broadcast_to( + beam_scores[:, None], (batch_size * num_beams, vocab_size) + ) # (batch_size * num_beams, vocab_size) + + # Top-p/top-k filtering + _scores = tf_top_k_top_p_filtering( + _scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2 + ) # (batch_size * num_beams, vocab_size) + # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search) + _scores = tf.reshape(_scores, (batch_size, num_beams * vocab_size)) + + next_tokens = tf.random.categorical( + _scores, dtype=tf.int32, num_samples=2 * num_beams + ) # (batch_size, 2 * num_beams) + # Compute next scores + next_scores = tf.gather(_scores, next_tokens, batch_dims=1) # (batch_size, 2 * num_beams) + + # sort the sampled vector to make sure that the first num_beams samples are the best + next_scores_indices = tf.argsort(next_scores, direction="DESCENDING", axis=1) + next_scores = tf.gather(next_scores, next_scores_indices, batch_dims=1) # (batch_size, num_beams * 2) + next_tokens = tf.gather(next_tokens, next_scores_indices, batch_dims=1) # (batch_size, num_beams * 2) + else: + # Add the log prob of the new beams to the log prob of the beginning of the sequence (sum of logs == log of the product) + next_scores = scores + tf.broadcast_to( + beam_scores[:, None], (batch_size * num_beams, vocab_size) + ) # (batch_size * num_beams, vocab_size) + + # re-organize to group the beam together (we are keeping top hypothesis accross beams) + next_scores = tf.reshape( + next_scores, (batch_size, num_beams * vocab_size) + ) # (batch_size, num_beams * vocab_size) + + next_scores, next_tokens = tf.math.top_k(next_scores, k=2 * num_beams, sorted=True) + + assert shape_list(next_scores) == shape_list(next_tokens) == [batch_size, 2 * num_beams] + + # next batch beam content + next_batch_beam = [] + + # for each sentence + for batch_idx in range(batch_size): + + # if we are done with this sentence + if done[batch_idx]: + assert ( + len(generated_hyps[batch_idx]) >= num_beams + ), "Batch can only be done if at least {} beams have been generated".format(num_beams) + assert ( + eos_token_id is not None and pad_token_id is not None + ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined" + next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams) # pad the batch + continue + + # next sentence beam content + next_sent_beam = [] + + # next tokens for this sentence + for beam_token_rank, (beam_token_id, beam_token_score) in enumerate( + zip(next_tokens[batch_idx], next_scores[batch_idx]) + ): + # get beam and token IDs + beam_id = beam_token_id // vocab_size + token_id = beam_token_id % vocab_size + + effective_beam_id = batch_idx * num_beams + beam_id + # add to generated hypotheses if end of sentence or last iteration + if (eos_token_id is not None) and (token_id.numpy() == eos_token_id): + # if beam_token does not belong to top num_beams tokens, it should not be added + is_beam_token_worse_than_top_num_beams = beam_token_rank >= num_beams + if is_beam_token_worse_than_top_num_beams: + continue + generated_hyps[batch_idx].add( + tf.identity(input_ids[effective_beam_id]), beam_token_score.numpy() + ) + else: + # add next predicted token if it is not eos_token + next_sent_beam.append((beam_token_score, token_id, 
effective_beam_id))
+
+                    # the beam for next step is full
+                    if len(next_sent_beam) == num_beams:
+                        break
+
+                # Check if we're done so that we can save a pad step if all(done)
+                done[batch_idx] = done[batch_idx] or generated_hyps[batch_idx].is_done(
+                    tf.reduce_max(next_scores[batch_idx]).numpy(), cur_len=cur_len
+                )
+
+                # update next beam content
+                assert len(next_sent_beam) == num_beams, "Beam should always be full"
+                next_batch_beam.extend(next_sent_beam)
+                assert len(next_batch_beam) == num_beams * (batch_idx + 1)
+
+            # stop when we are done with each sentence
+            if all(done):
+                break
+
+            # sanity check / prepare next batch
+            assert len(next_batch_beam) == batch_size * num_beams
+            beam_scores = tf.convert_to_tensor([x[0] for x in next_batch_beam], dtype=tf.float32)
+            beam_tokens = tf.convert_to_tensor([x[1] for x in next_batch_beam], dtype=tf.int32)
+            beam_idx = tf.convert_to_tensor([x[2] for x in next_batch_beam], dtype=tf.int32)
+
+            # re-order batch
+            input_ids = tf.stack([tf.identity(input_ids[x, :]) for x in beam_idx])
+            input_ids = tf.concat([input_ids, tf.expand_dims(beam_tokens, 1)], axis=-1)
+            # re-order internal states
+            if past is not None:
+                past = self._reorder_cache(past, beam_idx)
+
+            # extend attention_mask for new generated input if only decoder
+            if self.config.is_encoder_decoder is False:
+                attention_mask = tf.concat(
+                    [attention_mask, tf.ones((shape_list(attention_mask)[0], 1), dtype=tf.int32)], axis=-1
+                )
+
+            # update current length
+            cur_len = cur_len + 1
+
+        # finalize all open beam hypotheses and add them to generated hypotheses
+        for batch_idx in range(batch_size):
+            # Add all open beam hypothesis to generated_hyps
+            if done[batch_idx]:
+                continue
+            # test that beam scores match previously calculated scores if not eos and batch_idx not done
+            if eos_token_id is not None and all(
+                (token_id % vocab_size).numpy().item() != eos_token_id for token_id in next_tokens[batch_idx]
+            ):
+                assert tf.reduce_all(
+                    next_scores[batch_idx, :num_beams] == tf.reshape(beam_scores, (batch_size, num_beams))[batch_idx]
+                ), "If batch_idx is not done, final next scores: {} have to equal to accumulated beam_scores: {}".format(
+                    next_scores[:, :num_beams][batch_idx], tf.reshape(beam_scores, (batch_size, num_beams))[batch_idx]
+                )
+
+            # need to add best num_beams hypotheses to generated hyps
+            for beam_id in range(num_beams):
+                effective_beam_id = batch_idx * num_beams + beam_id
+                final_score = beam_scores[effective_beam_id].numpy().item()
+                final_tokens = input_ids[effective_beam_id]
+                generated_hyps[batch_idx].add(final_tokens, final_score)
+
+        # depending on whether greedy generation is wanted or not define different output_batch_size and output_num_return_sequences_per_batch
+        output_batch_size = batch_size if do_sample else batch_size * num_return_sequences
+        output_num_return_sequences_per_batch = 1 if do_sample else num_return_sequences
+
+        # select the best hypotheses
+        sent_lengths_list = []
+        best = []
+
+        # retrieve best hypotheses
+        for i, hypotheses in enumerate(generated_hyps):
+            sorted_hyps = sorted(hypotheses.beams, key=lambda x: x[0])
+            for j in range(output_num_return_sequences_per_batch):
+                best_hyp = sorted_hyps.pop()[1]
+                sent_lengths_list.append(len(best_hyp))
+                best.append(best_hyp)
+        assert output_batch_size == len(best), "Output batch size {} must match output beam hypotheses {}".format(
+            output_batch_size, len(best)
+        )
+
+        sent_lengths = tf.convert_to_tensor(sent_lengths_list, dtype=tf.int32)
+
+        # shorter batches are filled with pad_token
+        if tf.reduce_min(sent_lengths).numpy() != tf.reduce_max(sent_lengths).numpy():
+            assert pad_token_id is not None, "`pad_token_id` has to be defined"
+            sent_max_len = min(tf.reduce_max(sent_lengths).numpy() + 1, max_length)
+            decoded_list = []
+
+            # fill with hypothesis and eos_token_id if necessary
+            for i, hypo in enumerate(best):
+                assert sent_lengths[i] == shape_list(hypo)[0]
+                # if sent_length is max_len do not pad
+                if sent_lengths[i] == sent_max_len:
+                    decoded_slice = hypo
+                else:
+                    # else pad to sent_max_len
+                    num_pad_tokens = sent_max_len - sent_lengths[i]
+                    padding = pad_token_id * tf.ones((num_pad_tokens,), dtype=tf.int32)
+                    decoded_slice = tf.concat([hypo, padding], axis=-1)
+
+                    # finish sentence with EOS token
+                    if sent_lengths[i] < max_length:
+                        decoded_slice = tf.where(
+                            tf.range(sent_max_len, dtype=tf.int32) == sent_lengths[i],
+                            eos_token_id * tf.ones((sent_max_len,), dtype=tf.int32),
+                            decoded_slice,
+                        )
+                # add to list
+                decoded_list.append(decoded_slice)
+
+            decoded = tf.stack(decoded_list)
+        else:
+            # none of the hypotheses have an eos_token
+            assert all(len(hypo) == max_length for hypo in best)
+            decoded = tf.stack(best)
+
+        return decoded
+
+    @staticmethod
+    def _reorder_cache(past, beam_idx):
+        reordered_past = []
+        for layer_past in past:
+            # get the correct batch idx from layer past batch dim
+            # batch dim of `past` and `mems` is at 2nd position
+            reordered_layer_past = [tf.identity(tf.expand_dims(layer_past[:, i], 1)) for i in beam_idx]
+            reordered_layer_past = tf.concat(reordered_layer_past, axis=1)
+            # check that shape matches
+            assert shape_list(reordered_layer_past) == shape_list(layer_past)
+            reordered_past.append(reordered_layer_past)
+        past = tuple(reordered_past)
+        return past
+
+
+def _create_next_token_logits_penalties(input_ids, logits, repetition_penalty):
+    # create logit penalties for already seen input_ids
+    token_penalties = np.ones(shape_list(logits))
+    prev_input_ids = [np.unique(input_id) for input_id in input_ids.numpy()]
+    for i, prev_input_id in enumerate(prev_input_ids):
+        logit_penalized = logits[i].numpy()[prev_input_id]
+        logit_penalties = np.zeros(logit_penalized.shape)
+        # if previous logit score is < 0 then multiply repetition penalty else divide
+        logit_penalties[logit_penalized < 0] = repetition_penalty
+        logit_penalties[logit_penalized > 0] = 1 / repetition_penalty
+        np.put(token_penalties[i], prev_input_id, logit_penalties)
+    return tf.convert_to_tensor(token_penalties, dtype=tf.float32)
+
+
+def calc_banned_ngram_tokens(prev_input_ids, num_hypos, no_repeat_ngram_size, cur_len):
+    # Copied from fairseq for no_repeat_ngram in beam_search
+    if cur_len + 1 < no_repeat_ngram_size:
+        # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet
+        return [[] for _ in range(num_hypos)]
+    generated_ngrams = [{} for _ in range(num_hypos)]
+    for idx in range(num_hypos):
+        gen_tokens = prev_input_ids[idx].numpy().tolist()
+        generated_ngram = generated_ngrams[idx]
+        for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]):
+            prev_ngram_tuple = tuple(ngram[:-1])
+            generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]]
+
+    def _get_generated_ngrams(hypo_idx):
+        # Before decoding the next token, prevent decoding of ngrams that have already appeared
+        start_idx = cur_len + 1 - no_repeat_ngram_size
+        ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].numpy().tolist())
+        return generated_ngrams[hypo_idx].get(ngram_idx, [])
+
+    banned_tokens = 
[_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)] + return banned_tokens + + +def calc_banned_bad_words_ids(prev_input_ids, bad_words_ids): + banned_tokens = [] + + def _tokens_match(prev_tokens, tokens): + if len(tokens) == 0: + # if bad word tokens is just one token always ban it + return True + if len(tokens) > len(prev_input_ids): + # if bad word tokens are longer then prev input_ids they can't be equal + return False + + if prev_tokens[-len(tokens) :] == tokens: + # if tokens match + return True + else: + return False + + for prev_input_ids_slice in prev_input_ids: + banned_tokens_slice = [] + + for banned_token_seq in bad_words_ids: + assert len(banned_token_seq) > 0, "Banned words token sequences {} cannot have an empty list".format( + bad_words_ids + ) + + if _tokens_match(prev_input_ids_slice.numpy().tolist(), banned_token_seq[:-1]) is False: + # if tokens do not match continue + continue + + banned_tokens_slice.append(banned_token_seq[-1]) + + banned_tokens.append(banned_tokens_slice) + + return banned_tokens + + +def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1): + """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering + Args: + logits: logits distribution shape (batch size, vocabulary size) + if top_k > 0: keep only top k tokens with highest probability (top-k filtering). + if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). + Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751) + Make sure we keep at least min_tokens_to_keep per batch example in the output + From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 + """ + logits_shape = shape_list(logits) + + if top_k > 0: + top_k = min(max(top_k, min_tokens_to_keep), logits_shape[-1]) # Safety check + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = logits < tf.math.top_k(logits, k=top_k)[0][..., -1, None] + logits = set_tensor_by_indices_to_value(logits, indices_to_remove, filter_value) + + if top_p < 1.0: + sorted_indices = tf.argsort(logits, direction="DESCENDING") + sorted_logits = tf.gather( + logits, sorted_indices, axis=-1, batch_dims=1 + ) # expects logits to be of dim (batch_size, vocab_size) + + cumulative_probs = tf.math.cumsum(tf.nn.softmax(sorted_logits, axis=-1), axis=-1) + + # Remove tokens with cumulative probability above the threshold (token with 0 are kept) + sorted_indices_to_remove = cumulative_probs > top_p + + if min_tokens_to_keep > 1: + # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) + sorted_indices_to_remove = tf.concat( + [ + tf.zeros_like(sorted_indices_to_remove[:, :min_tokens_to_keep]), + sorted_indices_to_remove[:, min_tokens_to_keep:], + ], + -1, + ) + + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove = tf.roll(sorted_indices_to_remove, 1, axis=-1) + sorted_indices_to_remove = tf.concat( + [tf.zeros_like(sorted_indices_to_remove[:, :1]), sorted_indices_to_remove[:, 1:]], -1, + ) + # scatter sorted tensors to original indexing + indices_to_remove = scatter_values_on_batch_indices(sorted_indices_to_remove, sorted_indices) + logits = set_tensor_by_indices_to_value(logits, indices_to_remove, filter_value) + return logits + + +def scatter_values_on_batch_indices(values, batch_indices): + shape = shape_list(batch_indices) + # broadcast batch dim to shape 
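+    # e.g. for batch_indices of shape [2, 3] the broadcast below yields
+    # [[0, 0, 0], [1, 1, 1]], reshaped to [[0, 0, 0, 1, 1, 1]], so that each value
+    # can be paired with its (batch, position) coordinate for tf.scatter_nd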
+ broad_casted_batch_dims = tf.reshape(tf.broadcast_to(tf.expand_dims(tf.range(shape[0]), axis=-1), shape), [1, -1]) + # transform batch_indices to pair_indices + pair_indices = tf.transpose(tf.concat([broad_casted_batch_dims, tf.reshape(batch_indices, [1, -1])], 0)) + # scatter values to pair indices + return tf.scatter_nd(pair_indices, tf.reshape(values, [-1]), shape) + + +def set_tensor_by_indices_to_value(tensor, indices, value): + # create value_tensor since tensor value assignment is not possible in TF + value_tensor = tf.zeros_like(tensor) + value + return tf.where(indices, value_tensor, tensor) + + +class BeamHypotheses(object): + def __init__(self, num_beams, max_length, length_penalty, early_stopping): + """ + Initialize n-best list of hypotheses. + """ + self.max_length = max_length - 1 # ignoring bos_token + self.length_penalty = length_penalty + self.early_stopping = early_stopping + self.num_beams = num_beams + self.beams = [] + self.worst_score = 1e9 + + def __len__(self): + """ + Number of hypotheses in the list. + """ + return len(self.beams) + + def add(self, hyp, sum_logprobs): + """ + Add a new hypothesis to the list. + """ + score = sum_logprobs / len(hyp) ** self.length_penalty + if len(self) < self.num_beams or score > self.worst_score: + self.beams.append((score, hyp)) + if len(self) > self.num_beams: + sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) + del self.beams[sorted_scores[0][1]] + self.worst_score = sorted_scores[1][0] + else: + self.worst_score = min(score, self.worst_score) + + def is_done(self, best_sum_logprobs, cur_len=None): + """ + If there are enough hypotheses and that none of the hypotheses being generated + can become better than the worst one in the heap, then we are done with this sentence. + """ + + if len(self) < self.num_beams: + return False + elif self.early_stopping: + return True + else: + if cur_len is None: + cur_len = self.max_length + cur_score = best_sum_logprobs / cur_len ** self.length_penalty + ret = self.worst_score >= cur_score + return ret + + +class TFConv1D(tf.keras.layers.Layer): + def __init__(self, nf, nx, initializer_range=0.02, **kwargs): + """ TFConv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2) + Basically works like a Linear layer but the weights are transposed + """ + super().__init__(**kwargs) + self.nf = nf + self.nx = nx + self.initializer_range = initializer_range + + def build(self, input_shape): + self.weight = self.add_weight( + "weight", shape=[self.nx, self.nf], initializer=get_initializer(self.initializer_range) + ) + self.bias = self.add_weight("bias", shape=[1, self.nf], initializer=tf.zeros_initializer()) + + def call(self, x): + bz, sl = shape_list(x)[:2] + + x = tf.reshape(x, [-1, self.nx]) + x = tf.matmul(x, self.weight) + self.bias + + x = tf.reshape(x, [bz, sl, self.nf]) + + return x + + +class TFSharedEmbeddings(tf.keras.layers.Layer): + """Construct shared token embeddings. 
+ """ + + def __init__(self, vocab_size, hidden_size, initializer_range=None, **kwargs): + super().__init__(**kwargs) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.initializer_range = hidden_size ** -0.5 if initializer_range is None else initializer_range + + def build(self, input_shape): + """Build shared token embedding layer + Shared weights logic adapted from + https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 + """ + self.weight = self.add_weight( + "weight", shape=[self.vocab_size, self.hidden_size], initializer=get_initializer(self.initializer_range) + ) + super().build(input_shape) + + def call(self, inputs, mode="embedding"): + """Get token embeddings of inputs. + Args: + inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) + mode: string, a valid value is one of "embedding" and "linear". + Returns: + outputs: (1) If mode == "embedding", output embedding tensor, float32 with + shape [batch_size, length, embedding_size]; (2) mode == "linear", output + linear tensor, float32 with shape [batch_size, length, vocab_size]. + Raises: + ValueError: if mode is not valid. + + Shared weights logic adapted from + https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 + """ + if mode == "embedding": + return self._embedding(inputs) + elif mode == "linear": + return self._linear(inputs) + else: + raise ValueError("mode {} is not valid.".format(mode)) + + def _embedding(self, input_ids): + """Applies embedding based on inputs tensor.""" + return tf.gather(self.weight, input_ids) + + def _linear(self, inputs): + """Computes logits by running inputs through a linear layer. + Args: + inputs: A float32 tensor with shape [..., hidden_size] + Returns: + float32 tensor with shape [..., vocab_size]. + """ + first_dims = shape_list(inputs)[:-1] + + x = tf.reshape(inputs, [-1, self.hidden_size]) + logits = tf.matmul(x, self.weight, transpose_b=True) + + return tf.reshape(logits, first_dims + [self.vocab_size]) + + +class TFSequenceSummary(tf.keras.layers.Layer): + r""" Compute a single vector summary of a sequence hidden states according to various possibilities: + Args of the config class: + summary_type: + - 'last' => [default] take the last token hidden state (like XLNet) + - 'first' => take the first token hidden state (like Bert) + - 'mean' => take the mean of all tokens hidden states + - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2) + - 'attn' => Not implemented now, use multi-head attention + summary_use_proj: Add a projection after the vector extraction + summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False. + summary_activation: 'tanh' => add a tanh activation to the output, Other => no activation. Default + summary_first_dropout: Add a dropout before the projection and activation + summary_last_dropout: Add a dropout after the projection and activation + """ + + def __init__(self, config, initializer_range=0.02, **kwargs): + super().__init__(**kwargs) + + self.summary_type = config.summary_type if hasattr(config, "summary_use_proj") else "last" + if self.summary_type == "attn": + # We should use a standard multi-head attention module with absolute positional embedding for that. + # Cf. 
https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276 + # We can probably just use the multi-head attention module of PyTorch >=1.1.0 + raise NotImplementedError + + self.has_summary = hasattr(config, "summary_use_proj") and config.summary_use_proj + if self.has_summary: + if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0: + num_classes = config.num_labels + else: + num_classes = config.hidden_size + self.summary = tf.keras.layers.Dense( + num_classes, kernel_initializer=get_initializer(initializer_range), name="summary" + ) + + self.has_activation = hasattr(config, "summary_activation") and config.summary_activation == "tanh" + if self.has_activation: + self.activation = tf.keras.activations.tanh + + self.has_first_dropout = hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0 + if self.has_first_dropout: + self.first_dropout = tf.keras.layers.Dropout(config.summary_first_dropout) + + self.has_last_dropout = hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0 + if self.has_last_dropout: + self.last_dropout = tf.keras.layers.Dropout(config.summary_last_dropout) + + def call(self, inputs, training=False): + """ hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer. + cls_index: [optional] position of the classification token if summary_type == 'cls_index', + shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states. + if summary_type == 'cls_index' and cls_index is None: + we take the last token of the sequence as classification token + """ + if not isinstance(inputs, (dict, tuple, list)): + hidden_states = inputs + cls_index = None + elif isinstance(inputs, (tuple, list)): + hidden_states = inputs[0] + cls_index = inputs[1] if len(inputs) > 1 else None + assert len(inputs) <= 2, "Too many inputs." + else: + hidden_states = inputs.get("hidden_states") + cls_index = inputs.get("cls_index", None) + + if self.summary_type == "last": + output = hidden_states[:, -1] + elif self.summary_type == "first": + output = hidden_states[:, 0] + elif self.summary_type == "mean": + output = tf.reduce_mean(hidden_states, axis=1) + elif self.summary_type == "cls_index": + hidden_shape = shape_list(hidden_states) # e.g. 
[batch, num choices, seq length, hidden dims] + if cls_index is None: + cls_index = tf.fill( + hidden_shape[:-2], hidden_shape[-2] - 1 + ) # A tensor full of shape [batch] or [batch, num choices] full of sequence length + cls_shape = shape_list(cls_index) + if len(cls_shape) <= len(hidden_shape) - 2: + cls_index = cls_index[..., tf.newaxis] + # else: + # cls_index = cls_index[..., tf.newaxis] + # cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),)) + # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states + output = tf.gather(hidden_states, cls_index, batch_dims=len(hidden_shape) - 2) + output = tf.squeeze( + output, axis=len(hidden_shape) - 2 + ) # shape of output: (batch, num choices, hidden_size) + elif self.summary_type == "attn": + raise NotImplementedError + + if self.has_first_dropout: + output = self.first_dropout(output, training=training) + + if self.has_summary: + output = self.summary(output) + + if self.has_activation: + output = self.activation(output) + + if self.has_last_dropout: + output = self.last_dropout(output, training=training) + + return output + + +def shape_list(x): + """Deal with dynamic shape in tensorflow cleanly.""" + static = x.shape.as_list() + dynamic = tf.shape(x) + return [dynamic[i] if s is None else s for i, s in enumerate(static)] + + +def get_initializer(initializer_range=0.02): + """Creates a `tf.initializers.truncated_normal` with the given range. + Args: + initializer_range: float, initializer range for stddev. + Returns: + TruncatedNormal initializer with stddev = `initializer_range`. + """ + return tf.keras.initializers.TruncatedNormal(stddev=initializer_range) + + +TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { + "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-tf_model.h5", + "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-tf_model.h5", + "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-tf_model.h5", + "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-tf_model.h5", + "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-tf_model.h5", + "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-tf_model.h5", + "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-tf_model.h5", + "bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-tf_model.h5", + "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-tf_model.h5", + "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-tf_model.h5", + "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5", + "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5", + "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5", + "bert-base-japanese": 
"https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-tf_model.h5", + "bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5", + "bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5", + "bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5", + "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/tf_model.h5", + "bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/tf_model.h5", + "bert-base-dutch-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/tf_model.h5", +} + + +def gelu(x): + """ Gaussian Error Linear Unit. + Original Implementation of the gelu activation function in Google Bert repo when initially created. + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + Also see https://arxiv.org/abs/1606.08415 + """ + cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0))) + return x * cdf + + +def gelu_new(x): + """Gaussian Error Linear Unit. + This is a smoother version of the RELU. + Original paper: https://arxiv.org/abs/1606.08415 + Args: + x: float Tensor to perform activation. + Returns: + `x` with the GELU activation applied. + """ + cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + return x * cdf + + +def swish(x): + return x * tf.sigmoid(x) + + +ACT2FN = { + "gelu": tf.keras.layers.Activation(gelu), + "relu": tf.keras.activations.relu, + "swish": tf.keras.layers.Activation(swish), + "gelu_new": tf.keras.layers.Activation(gelu_new), +} + + +class TFBertEmbeddings(tf.keras.layers.Layer): + """Construct the embeddings from word, position and token_type embeddings. + """ + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size + self.initializer_range = config.initializer_range + + self.position_embeddings = tf.keras.layers.Embedding( + config.max_position_embeddings, + config.hidden_size, + embeddings_initializer=get_initializer(self.initializer_range), + name="position_embeddings", + ) + self.token_type_embeddings = tf.keras.layers.Embedding( + config.type_vocab_size, + config.hidden_size, + embeddings_initializer=get_initializer(self.initializer_range), + name="token_type_embeddings", + ) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + + def build(self, input_shape): + """Build shared word embedding layer """ + with tf.name_scope("word_embeddings"): + # Create and initialize weights. The random normal initializer was chosen + # arbitrarily, and works well. 
+ self.word_embeddings = self.add_weight( + "weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(self.initializer_range), + ) + super().build(input_shape) + + def call(self, inputs, mode="embedding", training=False): + """Get token embeddings of inputs. + Args: + inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) + mode: string, a valid value is one of "embedding" and "linear". + Returns: + outputs: (1) If mode == "embedding", output embedding tensor, float32 with + shape [batch_size, length, embedding_size]; (2) mode == "linear", output + linear tensor, float32 with shape [batch_size, length, vocab_size]. + Raises: + ValueError: if mode is not valid. + + Shared weights logic adapted from + https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 + """ + if mode == "embedding": + return self._embedding(inputs, training=training) + elif mode == "linear": + return self._linear(inputs) + else: + raise ValueError("mode {} is not valid.".format(mode)) + + def _embedding(self, inputs, training=False): + """Applies embedding based on inputs tensor.""" + input_ids, position_ids, token_type_ids, inputs_embeds = inputs + + if input_ids is not None: + input_shape = shape_list(input_ids) + else: + input_shape = shape_list(inputs_embeds)[:-1] + + seq_length = input_shape[1] + if position_ids is None: + position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] + if token_type_ids is None: + token_type_ids = tf.fill(input_shape, 0) + + if inputs_embeds is None: + inputs_embeds = tf.gather(self.word_embeddings, input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings, training=training) + return embeddings + + def _linear(self, inputs): + """Computes logits by running inputs through a linear layer. + Args: + inputs: A float32 tensor with shape [batch_size, length, hidden_size] + Returns: + float32 tensor with shape [batch_size, length, vocab_size]. 
+ """ + batch_size = shape_list(inputs)[0] + length = shape_list(inputs)[1] + + x = tf.reshape(inputs, [-1, self.hidden_size]) + logits = tf.matmul(x, self.word_embeddings, transpose_b=True) + + return tf.reshape(logits, [batch_size, length, self.vocab_size]) + + +class TFBertSelfAttention(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads) + ) + self.output_attentions = config.output_attentions + + self.num_attention_heads = config.num_attention_heads + assert config.hidden_size % config.num_attention_heads == 0 + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.amp = config.amp + + self.query = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" + ) + self.key = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + ) + self.value = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" + ) + + self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x, batch_size): + x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size)) + return tf.transpose(x, perm=[0, 2, 1, 3]) + + def call(self, inputs, training=False): + hidden_states, attention_mask, head_mask = inputs + + batch_size = shape_list(hidden_states)[0] + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) + key_layer = self.transpose_for_scores(mixed_key_layer, batch_size) + value_layer = self.transpose_for_scores(mixed_value_layer, batch_size) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = tf.matmul( + query_layer, key_layer, transpose_b=True + ) # (batch size, num_heads, seq_len_q, seq_len_k) + dk = tf.cast(shape_list(key_layer)[-1], tf.float32) + attention_scores = attention_scores / tf.cast(tf.math.sqrt(dk), tf.float16 if self.amp else tf.float32) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in TFBertModel call() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = tf.nn.softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs, training=training) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = tf.matmul(attention_probs, value_layer) + + context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) + context_layer = tf.reshape( + context_layer, (batch_size, -1, self.all_head_size) + ) # (batch_size, seq_len_q, all_head_size) + + outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) + return outputs + + +class TFBertSelfOutput(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.dense = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + + def call(self, inputs, training=False): + hidden_states, input_tensor = inputs + + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class TFBertAttention(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.self_attention = TFBertSelfAttention(config, name="self") + self.dense_output = TFBertSelfOutput(config, name="output") + + def prune_heads(self, heads): + raise NotImplementedError + + def call(self, inputs, training=False): + input_tensor, attention_mask, head_mask = inputs + + self_outputs = self.self_attention([input_tensor, attention_mask, head_mask], training=training) + attention_output = self.dense_output([self_outputs[0], input_tensor], training=training) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class TFBertIntermediate(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.dense = tf.keras.layers.Dense( + config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def call(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class TFBertOutput(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.dense = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + + def call(self, inputs, training=False): + hidden_states, input_tensor = inputs + + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class TFBertLayer(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.attention = TFBertAttention(config, name="attention") + self.intermediate = TFBertIntermediate(config, name="intermediate") + self.bert_output = TFBertOutput(config, name="output") + + def 
call(self, inputs, training=False): + hidden_states, attention_mask, head_mask = inputs + + attention_outputs = self.attention([hidden_states, attention_mask, head_mask], training=training) + attention_output = attention_outputs[0] + intermediate_output = self.intermediate(attention_output) + layer_output = self.bert_output([intermediate_output, attention_output], training=training) + outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them + return outputs + + +class TFBertEncoder(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + self.layer = [TFBertLayer(config, name="layer_._{}".format(i)) for i in range(config.num_hidden_layers)] + + def call(self, inputs, training=False): + hidden_states, attention_mask, head_mask = inputs + + all_hidden_states = () + all_attentions = () + for i, layer_module in enumerate(self.layer): + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module([hidden_states, attention_mask, head_mask[i]], training=training) + hidden_states = layer_outputs[0] + + if self.output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + # Add last layer + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = (hidden_states,) + if self.output_hidden_states: + outputs = outputs + (all_hidden_states,) + if self.output_attentions: + outputs = outputs + (all_attentions,) + return outputs # outputs, (hidden states), (attentions) + + +class TFBertPooler(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.dense = tf.keras.layers.Dense( + config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation="tanh", + name="dense", + ) + + def call(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + return pooled_output + + +class TFBertPredictionHeadTransform(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.dense = tf.keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + + def call(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class TFBertLMPredictionHead(tf.keras.layers.Layer): + def __init__(self, config, input_embeddings, **kwargs): + super().__init__(**kwargs) + self.vocab_size = config.vocab_size + self.transform = TFBertPredictionHeadTransform(config, name="transform") + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
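+        # `input_embeddings` is the TFBertEmbeddings layer; call() below invokes it
+        # with mode="linear", which multiplies hidden states by the transposed word
+        # embedding matrix to produce vocabulary logits, then adds the bias.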
+ self.input_embeddings = input_embeddings + + def build(self, input_shape): + self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") + super().build(input_shape) + + def call(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.input_embeddings(hidden_states, mode="linear") + hidden_states = hidden_states + self.bias + return hidden_states + + +class TFBertMLMHead(tf.keras.layers.Layer): + def __init__(self, config, input_embeddings, **kwargs): + super().__init__(**kwargs) + self.predictions = TFBertLMPredictionHead(config, input_embeddings, name="predictions") + + def call(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class TFBertNSPHead(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.seq_relationship = tf.keras.layers.Dense( + 2, kernel_initializer=get_initializer(config.initializer_range), name="seq_relationship" + ) + + def call(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +@keras_serializable +class TFBertMainLayer(tf.keras.layers.Layer): + config_class = BertConfig + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.num_hidden_layers = config.num_hidden_layers + + self.embeddings = TFBertEmbeddings(config, name="embeddings") + self.encoder = TFBertEncoder(config, name="encoder") + self.pooler = TFBertPooler(config, name="pooler") + + def get_input_embeddings(self): + return self.embeddings + + def _resize_token_embeddings(self, new_num_tokens): + raise NotImplementedError + + def _prune_heads(self, heads_to_prune): + """ Prunes heads of the model. + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + See base class PreTrainedModel + """ + raise NotImplementedError + + def call( + self, + inputs, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + attention_mask = inputs[1] if len(inputs) > 1 else attention_mask + token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids + position_ids = inputs[3] if len(inputs) > 3 else position_ids + head_mask = inputs[4] if len(inputs) > 4 else head_mask + inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds + assert len(inputs) <= 6, "Too many inputs." + elif isinstance(inputs, (dict, BatchEncoding)): + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) + assert len(inputs) <= 6, "Too many inputs." 
+ else: + input_ids = inputs + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if attention_mask is None: + attention_mask = tf.fill(input_shape, 1) + if token_type_ids is None: + token_type_ids = tf.fill(input_shape, 0) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :] + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + + extended_attention_mask = tf.cast(extended_attention_mask, tf.float32) + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if head_mask is not None: + raise NotImplementedError + else: + head_mask = [None] * self.num_hidden_layers + # head_mask = tf.constant([0] * self.num_hidden_layers) + + embedding_output = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training) + encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training) + + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) + + outputs = (sequence_output, pooled_output,) + encoder_outputs[ + 1: + ] # add hidden_states and attentions if they are here + return outputs # sequence_output, pooled_output, (hidden_states), (attentions) + + +class TFBertPreTrainedModel(TFPreTrainedModel): + """ An abstract class to handle weights initialization and + a simple interface for downloading and loading pretrained models. + """ + + config_class = BertConfig + pretrained_model_archive_map = TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP + base_model_prefix = "bert" + + +BERT_START_DOCSTRING = r""" + This model is a `tf.keras.Model `__ sub-class. + Use it as a regular TF 2.0 Keras Model and + refer to the TF 2.0 documentation for all matter related to general usage and behavior. + + .. note:: + + TF 2.0 models accepts two formats as inputs: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional arguments. + + This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having + all the tensors in the first argument of the model call function: :obj:`model(inputs)`. 
+ + If you choose this second option, there are three possibilities you can use to gather all the input Tensors + in the first positional argument : + + - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` + + Parameters: + config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +BERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`transformers.BertTokenizer`. + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Segment token indices to indicate first and second portions of the inputs. + Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` + corresponds to a `sentence B` token + + `What are token type IDs? <../glossary.html#token-type-ids>`__ + position_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1]``. + + `What are position IDs? <../glossary.html#position-ids>`__ + head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. + inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, embedding_dim)`, `optional`, defaults to :obj:`None`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + training (:obj:`boolean`, `optional`, defaults to :obj:`False`): + Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them + (if set to :obj:`False`) for evaluation. 
+""" + + +@add_start_docstrings( + "The bare Bert Model transformer outputing raw hidden-states without any specific head on top.", + BERT_START_DOCSTRING, +) +class TFBertModel(TFBertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.bert = TFBertMainLayer(config, name="bert") + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) + def call(self, inputs, **kwargs): + r""" + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) + further processed by a Linear layer and a Tanh activation function. The Linear + layer weights are trained from the next sentence prediction (classification) + objective during Bert pretraining. This output is usually *not* a good summary + of the semantic content of the input, you're often better with averaging or pooling + the sequence of hidden-states for the whole input sequence. + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + + Examples:: + + import tensorflow as tf + from transformers import BertTokenizer, TFBertModel + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = TFBertModel.from_pretrained('bert-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + """ + outputs = self.bert(inputs, **kwargs) + return outputs + + +@add_start_docstrings( + """Bert Model with two heads on top as done during the pre-training: + a `masked language modeling` head and a `next sentence prediction (classification)` head. 
""", + BERT_START_DOCSTRING, +) +class TFBertForPreTraining(TFBertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.bert = TFBertMainLayer(config, name="bert") + self.nsp = TFBertNSPHead(config, name="nsp___cls") + self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls") + + def get_output_embeddings(self): + return self.bert.embeddings + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) + def call(self, inputs, **kwargs): + r""" + Return: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import BertTokenizer, TFBertForPreTraining + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = TFBertForPreTraining.from_pretrained('bert-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 + outputs = model(input_ids) + prediction_scores, seq_relationship_scores = outputs[:2] + + """ + outputs = self.bert(inputs, **kwargs) + + sequence_output, pooled_output = outputs[:2] + prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False)) + seq_relationship_score = self.nsp(pooled_output) + + outputs = (prediction_scores, seq_relationship_score,) + outputs[ + 2: + ] # add hidden states and attention if they are here + + return outputs # prediction_scores, seq_relationship_score, (hidden_states), (attentions) + + +@add_start_docstrings("""Bert Model with a `language modeling` head on top. 
""", BERT_START_DOCSTRING) +class TFBertForMaskedLM(TFBertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.bert = TFBertMainLayer(config, name="bert") + self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls") + + def get_output_embeddings(self): + return self.bert.embeddings + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) + def call(self, inputs, **kwargs): + r""" + Return: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import BertTokenizer, TFBertForMaskedLM + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = TFBertForMaskedLM.from_pretrained('bert-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 + outputs = model(input_ids) + prediction_scores = outputs[0] + + """ + outputs = self.bert(inputs, **kwargs) + + sequence_output = outputs[0] + prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False)) + + outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here + + return outputs # prediction_scores, (hidden_states), (attentions) + + +@add_start_docstrings( + """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING, +) +class TFBertForNextSentencePrediction(TFBertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.bert = TFBertMainLayer(config, name="bert") + self.nsp = TFBertNSPHead(config, name="nsp___cls") + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) + def call(self, inputs, **kwargs): + r""" + Return: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + seq_relationship_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`) + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. 
+ + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import BertTokenizer, TFBertForNextSentencePrediction + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = TFBertForNextSentencePrediction.from_pretrained('bert-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 + outputs = model(input_ids) + seq_relationship_scores = outputs[0] + + """ + outputs = self.bert(inputs, **kwargs) + + pooled_output = outputs[1] + seq_relationship_score = self.nsp(pooled_output) + + outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here + + return outputs # seq_relationship_score, (hidden_states), (attentions) + + +@add_start_docstrings( + """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of + the pooled output) e.g. for GLUE tasks. """, + BERT_START_DOCSTRING, +) +class TFBertForSequenceClassification(TFBertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.bert = TFBertMainLayer(config, name="bert") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) + def call(self, inputs, **kwargs): + r""" + Return: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + import tensorflow as tf + from transformers import BertTokenizer, TFBertForSequenceClassification + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 + outputs = model(input_ids) + logits = outputs[0] + + """ + outputs = self.bert(inputs, **kwargs) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False)) + logits = self.classifier(pooled_output) + + outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + + return outputs # logits, (hidden_states), (attentions) + + +@add_start_docstrings( + """Bert Model with a multiple choice classification head on top (a linear layer on top of + the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, + BERT_START_DOCSTRING, +) +class TFBertForMultipleChoice(TFBertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.bert = TFBertMainLayer(config, name="bert") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = tf.keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + + @property + def dummy_inputs(self): + """ Dummy inputs to build the network. + + Returns: + tf.Tensor with dummy inputs + """ + return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) + def call( + self, + inputs, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): + r""" + Return: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + classification_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, num_choices)`: + `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). + + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + import tensorflow as tf + from transformers import BertTokenizer, TFBertForMultipleChoice + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = TFBertForMultipleChoice.from_pretrained('bert-base-uncased') + choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] + input_ids = tf.constant([tokenizer.encode(s) for s in choices])[None, :] # Batch size 1, 2 choices + outputs = model(input_ids) + classification_scores = outputs[0] + + """ + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + attention_mask = inputs[1] if len(inputs) > 1 else attention_mask + token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids + position_ids = inputs[3] if len(inputs) > 3 else position_ids + head_mask = inputs[4] if len(inputs) > 4 else head_mask + inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds + assert len(inputs) <= 6, "Too many inputs." + elif isinstance(inputs, dict): + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) + assert len(inputs) <= 6, "Too many inputs." + else: + input_ids = inputs + + if input_ids is not None: + num_choices = shape_list(input_ids)[1] + seq_length = shape_list(input_ids)[2] + else: + num_choices = shape_list(inputs_embeds)[1] + seq_length = shape_list(inputs_embeds)[2] + + flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None + flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None + flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None + flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None + + flat_inputs = [ + flat_input_ids, + flat_attention_mask, + flat_token_type_ids, + flat_position_ids, + head_mask, + inputs_embeds, + ] + + outputs = self.bert(flat_inputs, training=training) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output, training=training) + logits = self.classifier(pooled_output) + reshaped_logits = tf.reshape(logits, (-1, num_choices)) + + outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here + + return outputs # reshaped_logits, (hidden_states), (attentions) + + +@add_start_docstrings( + """Bert Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. 
""", + BERT_START_DOCSTRING, +) +class TFBertForTokenClassification(TFBertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.bert = TFBertMainLayer(config, name="bert") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) + def call(self, inputs, **kwargs): + r""" + Return: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import BertTokenizer, TFBertForTokenClassification + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = TFBertForTokenClassification.from_pretrained('bert-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 + outputs = model(input_ids) + scores = outputs[0] + + """ + outputs = self.bert(inputs, **kwargs) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False)) + logits = self.classifier(sequence_output) + + outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + + return outputs # scores, (hidden_states), (attentions) + + +@add_start_docstrings( + """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of + the hidden-states output to compute `span start logits` and `span end logits`). """, + BERT_START_DOCSTRING, +) +class TFBertForQuestionAnswering(TFBertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.bert = TFBertMainLayer(config, name="bert") + self.qa_outputs = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + ) + + @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING) + def call(self, inputs, **kwargs): + r""" + Return: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + start_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): + Span-start scores (before SoftMax). 
+ end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): + Span-end scores (before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import BertTokenizer, TFBertForQuestionAnswering + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = TFBertForQuestionAnswering.from_pretrained('bert-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 + outputs = model(input_ids) + start_scores, end_scores = outputs[:2] + + """ + outputs = self.bert(inputs, **kwargs) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = tf.split(logits, 2, axis=-1) + start_logits = tf.squeeze(start_logits, axis=-1) + end_logits = tf.squeeze(end_logits, axis=-1) + + outputs = (start_logits, end_logits,) + outputs[2:] + + return outputs # start_logits, end_logits, (hidden_states), (attentions) diff --git a/modelzoo/ELECTRA/optimization.py b/modelzoo/ELECTRA/optimization.py new file mode 100644 index 00000000..b83e487c --- /dev/null +++ b/modelzoo/ELECTRA/optimization.py @@ -0,0 +1,383 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
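# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patched files): reducing the start/end
# scores from the TFBertForQuestionAnswering docstring example above to an
# answer string. It reuses `tokenizer`, `input_ids`, `start_scores` and
# `end_scores` from that example; the greedy argmax is for illustration only
# (run_inference.py later in this patch performs a proper n-best search).
# ---------------------------------------------------------------------------
import tensorflow as tf

start_index = int(tf.argmax(start_scores, axis=-1)[0])
end_index = int(tf.argmax(end_scores, axis=-1)[0])
tokens = tokenizer.convert_ids_to_tokens(input_ids.numpy()[0].tolist())
answer = " ".join(tokens[start_index:end_index + 1])
print(answer)
# ---------------------------------------------------------------------------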
+# ==============================================================================
+"""Functions and classes related to optimization (weight updates)."""
+
+import re
+import collections
+import tensorflow as tf
+import tensorflow_addons.optimizers as tfa_optimizers
+
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import training_ops
+from utils import log
+
+
+class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
+    """Applies a warmup schedule on a given learning rate decay schedule."""
+
+    def __init__(self, initial_learning_rate, decay_schedule_fn, warmup_steps, power=1.0, name=None):
+        super().__init__()
+        self.initial_learning_rate = initial_learning_rate
+        self.warmup_steps = warmup_steps
+        self.power = power
+        self.decay_schedule_fn = decay_schedule_fn
+        self.name = name
+
+    def __call__(self, step):
+        with tf.name_scope(self.name or "WarmUp") as name:
+            # Implements polynomial warmup, i.e. if global_step < warmup_steps, the
+            # learning rate will be `global_step/num_warmup_steps * init_lr`.
+            global_step_float = tf.cast(step, tf.float32)
+            warmup_steps_float = tf.cast(self.warmup_steps, tf.float32)
+            warmup_percent_done = global_step_float / warmup_steps_float
+            warmup_learning_rate = self.initial_learning_rate * tf.math.pow(warmup_percent_done, self.power)
+            return tf.cond(
+                global_step_float < warmup_steps_float,
+                lambda: warmup_learning_rate,
+                lambda: self.decay_schedule_fn(step - self.warmup_steps),
+                name=name,
+            )
+
+    def get_config(self):
+        return {
+            "initial_learning_rate": self.initial_learning_rate,
+            "decay_schedule_fn": self.decay_schedule_fn,
+            "warmup_steps": self.warmup_steps,
+            "power": self.power,
+            "name": self.name,
+        }
+
+
+def create_optimizer(init_lr, num_train_steps, num_warmup_steps, weight_decay_rate=0.01,
+                     layerwise_lr_decay=-1, n_transformer_layers=None, clip_norm=1.0,
+                     optimizer="adam", skip_adaptive=False, power=1.0, beta_1=0.9, beta_2=0.999, end_lr=0.0):
+    """Creates an optimizer with learning rate schedule."""
+    # Implements linear decay of the learning rate.
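+    # Net effect of the schedule constructed below: when num_warmup_steps > 0, the
+    # learning rate ramps linearly from 0 to init_lr over the first num_warmup_steps
+    # steps (WarmUp with its default power=1.0); afterwards it follows a polynomial
+    # decay with the given `power` from init_lr down to end_lr over the remaining
+    # (num_train_steps - num_warmup_steps) steps.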
+    learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
+        initial_learning_rate=init_lr, decay_steps=num_train_steps - num_warmup_steps, end_learning_rate=end_lr, power=power
+    )
+    if num_warmup_steps:
+        learning_rate_fn = WarmUp(
+            initial_learning_rate=init_lr, decay_schedule_fn=learning_rate_fn, warmup_steps=num_warmup_steps
+        )
+    layer_decay = None
+    if layerwise_lr_decay > 0 and n_transformer_layers is not None:
+        layer_decay = _get_layer_decay(layerwise_lr_decay, n_transformer_layers)
+
+    if optimizer == "adam":
+        optimizer = AdamWeightDecay(
+            learning_rate=learning_rate_fn,
+            weight_decay_rate=weight_decay_rate,
+            layer_decay=layer_decay,
+            beta_1=beta_1,
+            beta_2=beta_2,
+            epsilon=1e-6,
+            exclude_from_weight_decay=["layer_norm", "bias", "LayerNorm"],
+            clip_norm=clip_norm,
+        )
+    else:
+        if skip_adaptive:
+            skip_list = ["layer_norm", "bias", "LayerNorm"]
+        else:
+            skip_list = ["None"]
+        log("Skip list for LAMB {}".format(skip_list))
+
+        optimizer = tfa_optimizers.LAMB(
+            learning_rate=learning_rate_fn,
+            weight_decay_rate=weight_decay_rate,
+            beta_1=beta_1,
+            beta_2=beta_2,
+            epsilon=1e-6,
+            exclude_from_weight_decay=["layer_norm", "bias", "LayerNorm"],
+            exclude_from_layer_adaptation=skip_list,
+        )
+
+    return optimizer
+
+
+class AdamWeightDecay(tf.keras.optimizers.Adam):
+    """Adam enables L2 weight decay and clip_by_global_norm on gradients.
+
+    Just adding the square of the weights to the loss function is *not* the
+    correct way of using L2 regularization/weight decay with Adam, since that will
+    interact with the m and v parameters in strange ways.
+
+    Instead we want to decay the weights in a manner that doesn't interact with
+    the m/v parameters. This is equivalent to adding the square of the weights to
+    the loss with plain (non-momentum) SGD.
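+
+    Concretely, `_decay_weights_op` below subtracts
+    `learning_rate * weight_decay_rate * var` from each eligible variable before
+    the Adam update is applied, and `_do_use_weight_decay` exempts any variable
+    whose name matches an entry in `exclude_from_weight_decay`.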
+ """ + + def __init__( + self, + learning_rate=0.001, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-7, + amsgrad=False, + weight_decay_rate=0.0, + include_in_weight_decay=None, + exclude_from_weight_decay=None, + layer_decay=None, + clip_norm=1.0, + name="AdamWeightDecay", + **kwargs + ): + super().__init__(learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs) + self.weight_decay_rate = weight_decay_rate + self._include_in_weight_decay = include_in_weight_decay + self._exclude_from_weight_decay = exclude_from_weight_decay + self.layer_decay = layer_decay + self.clip_norm = clip_norm + + @classmethod + def from_config(cls, config): + """Creates an optimizer from its config with WarmUp custom object.""" + custom_objects = {"WarmUp": WarmUp} + return super().from_config(config, custom_objects=custom_objects) + + def _prepare_local(self, var_device, var_dtype, apply_state): + super()._prepare_local(var_device, var_dtype, apply_state) + apply_state["weight_decay_rate"] = tf.constant(self.weight_decay_rate, name="adam_weight_decay_rate") + + def _decay_weights_op(self, var, learning_rate, apply_state): + do_decay = self._do_use_weight_decay(var.name) + if do_decay: + return var.assign_sub( + learning_rate * var * apply_state["weight_decay_rate"], use_locking=self._use_locking + ) + return tf.no_op() + + def apply_gradients(self, grads_and_vars, name=None, experimental_aggregate_gradients=True): + grads, tvars = list(zip(*grads_and_vars)) + # Being done in train_step + ##(grads, _) = tf.clip_by_global_norm(grads, clip_norm=self.clip_norm) + return super().apply_gradients(zip(grads, tvars), name=name, + experimental_aggregate_gradients=experimental_aggregate_gradients) + + def _get_lr(self, var, apply_state): + """Retrieves the learning rate with the given state.""" + # if apply_state is None: + # return self._decayed_lr_t[var_dtype], {} + var_name, var_device, var_dtype = var.name, var.device, var.dtype.base_dtype + + apply_state = apply_state or {} + coefficients = apply_state.get((var_device, var_dtype)) + if coefficients is None: + coefficients = self._fallback_apply_state(var_device, var_dtype) + apply_state[(var_device, var_dtype)] = coefficients + lr_t = coefficients["lr_t"] + lr = coefficients["lr"] + + if self.layer_decay is not None: + update_for_var = False + for key in self.layer_decay: + if key in var_name: + update_for_var = True + lr_t *= self.layer_decay[key] + lr *= self.layer_decay[key] + break + if not update_for_var: + raise ValueError("No learning rate specified for variable", var) + + return lr_t, lr, coefficients, dict(apply_state=apply_state) + + def _resource_apply_dense(self, grad, var, apply_state=None): + # print("Dense: {} {} {}".format(var.name, var.device, var.dtype.base_dtype)) + lr_t, _, coefficients, kwargs = self._get_lr(var, apply_state) + decay = self._decay_weights_op(var, lr_t, apply_state) + with tf.control_dependencies([decay]): + m = self.get_slot(var, 'm') + v = self.get_slot(var, 'v') + + if not self.amsgrad: + return training_ops.resource_apply_adam( + var.handle, + m.handle, + v.handle, + coefficients['beta_1_power'], + coefficients['beta_2_power'], + lr_t, + coefficients['beta_1_t'], + coefficients['beta_2_t'], + coefficients['epsilon'], + grad, + use_locking=self._use_locking) + else: + vhat = self.get_slot(var, 'vhat') + return training_ops.resource_apply_adam_with_amsgrad( + var.handle, + m.handle, + v.handle, + vhat.handle, + coefficients['beta_1_power'], + coefficients['beta_2_power'], + lr_t, + coefficients['beta_1_t'], + 
coefficients['beta_2_t'], + coefficients['epsilon'], + grad, + use_locking=self._use_locking) + + def _resource_apply_sparse(self, grad, var, indices, apply_state=None): + # print("Sparse: {} {} {}".format(var.name, var.device, var.dtype.base_dtype)) + lr_t, lr, coefficients, kwargs = self._get_lr(var, apply_state) + decay = self._decay_weights_op(var, lr_t, apply_state) + with tf.control_dependencies([decay]): + # m_t = beta1 * m + (1 - beta1) * g_t + m = self.get_slot(var, 'm') + m_scaled_g_values = grad * coefficients['one_minus_beta_1_t'] + m_t = state_ops.assign(m, m * coefficients['beta_1_t'], + use_locking=self._use_locking) + with tf.control_dependencies([m_t]): + m_t = self._resource_scatter_add(m, indices, m_scaled_g_values) + + # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) + v = self.get_slot(var, 'v') + v_scaled_g_values = (grad * grad) * coefficients['one_minus_beta_2_t'] + v_t = state_ops.assign(v, v * coefficients['beta_2_t'], + use_locking=self._use_locking) + with tf.control_dependencies([v_t]): + v_t = self._resource_scatter_add(v, indices, v_scaled_g_values) + + if not self.amsgrad: + v_sqrt = math_ops.sqrt(v_t) + var_update = state_ops.assign_sub( + var, lr * m_t / (v_sqrt + coefficients['epsilon']), + use_locking=self._use_locking) + return control_flow_ops.group(*[var_update, m_t, v_t]) + else: + v_hat = self.get_slot(var, 'vhat') + v_hat_t = math_ops.maximum(v_hat, v_t) + with tf.control_dependencies([v_hat_t]): + v_hat_t = state_ops.assign( + v_hat, v_hat_t, use_locking=self._use_locking) + v_hat_sqrt = math_ops.sqrt(v_hat_t) + var_update = state_ops.assign_sub( + var, + lr * m_t / (v_hat_sqrt + coefficients['epsilon']), + use_locking=self._use_locking) + return control_flow_ops.group(*[var_update, m_t, v_t, v_hat_t]) + + def get_config(self): + config = super().get_config() + config.update({"weight_decay_rate": self.weight_decay_rate}) + return config + + def _do_use_weight_decay(self, param_name): + """Whether to use L2 weight decay for `param_name`.""" + if self.weight_decay_rate == 0: + return False + + if self._include_in_weight_decay: + for r in self._include_in_weight_decay: + if re.search(r, param_name) is not None: + return True + + if self._exclude_from_weight_decay: + for r in self._exclude_from_weight_decay: + if re.search(r, param_name) is not None: + return False + return True + + +# Inspired from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py +class GradientAccumulator(object): + """Distribution strategies-aware gradient accumulation utility.""" + + def __init__(self): + """Initializes the accumulator.""" + self._gradients = [] + self._accum_steps = tf.Variable( + initial_value=0, dtype=tf.int64, trainable=False, aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA + ) + + @property + def step(self): + """Number of accumulated steps.""" + return self._accum_steps.value() + + @property + def gradients(self): + """The accumulated gradients.""" + return list( + gradient.value() if gradient is not None else gradient for gradient in self._get_replica_gradients() + ) + + def __call__(self, gradients): + """Accumulates :obj:`gradients`.""" + if not self._gradients: + self._gradients.extend( + [ + tf.Variable(tf.zeros_like(gradient), trainable=False) if gradient is not None else gradient + for gradient in gradients + ] + ) + + if len(gradients) != len(self._gradients): + raise ValueError("Expected %s gradients, but got %d" % (len(self._gradients), len(gradients))) + + for accum_gradient, gradient in 
zip(self._get_replica_gradients(), gradients): + if accum_gradient is not None and gradient is not None: + accum_gradient.assign_add(gradient) + + self._accum_steps.assign_add(1) + + def reset(self): + """Resets the accumulated gradients.""" + if self._gradients: + self._accum_steps.assign(0) + + for gradient in self._get_replica_gradients(): + if gradient is not None: + gradient.assign(tf.zeros_like(gradient)) + + def _get_replica_gradients(self): + if tf.distribute.has_strategy(): + # In a replica context, we want to accumulate gradients on each replica + # without synchronization, so we directly assign the value of the + # current replica. + replica_context = tf.distribute.get_replica_context() + + if replica_context is None or tf.distribute.get_strategy().num_replicas_in_sync == 1: + return self._gradients + + return ( + gradient.device_map.select_for_current_replica(gradient.values, replica_context) + for gradient in self._gradients + if gradient is not None + ) + else: + return self._gradients + + +def _get_layer_decay(layer_decay, n_layers): + """Have lower learning rates for layers closer to the input.""" + key_to_depths = collections.OrderedDict({ + "/embeddings/": 0, + "/embeddings_project/": 0, + "/start_logits/": n_layers + 2, + "/end_logits/": n_layers + 2, + "/answer_class/": n_layers + 2, + "/qa_outputs/": n_layers + 2, + }) + for layer in range(n_layers): + key_to_depths["encoder/layer_._" + str(layer) + "/"] = layer + 1 + return { + key: layer_decay ** (n_layers + 2 - depth) + for key, depth in key_to_depths.items() + } diff --git a/modelzoo/ELECTRA/postprocess_pretrained_ckpt.py b/modelzoo/ELECTRA/postprocess_pretrained_ckpt.py new file mode 100644 index 00000000..a18c3643 --- /dev/null +++ b/modelzoo/ELECTRA/postprocess_pretrained_ckpt.py @@ -0,0 +1,72 @@ +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
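# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patched files): the multipliers that
# `_get_layer_decay` in optimization.py above produces. With layer_decay=0.8
# and n_layers=12, parameters closer to the input get exponentially smaller
# learning-rate multipliers, while the task heads keep the full rate.
# ---------------------------------------------------------------------------
from optimization import _get_layer_decay

mult = _get_layer_decay(0.8, 12)
print(mult["/embeddings/"])         # 0.8 ** 14 -- depth 0, smallest multiplier
print(mult["encoder/layer_._0/"])   # 0.8 ** 13 -- first transformer layer
print(mult["encoder/layer_._11/"])  # 0.8 ** 2  -- last transformer layer
print(mult["/qa_outputs/"])         # 0.8 ** 0 == 1.0 -- heads keep the base LR
# ---------------------------------------------------------------------------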
+ +import argparse +import collections +import json +import os + +import tensorflow as tf + +from utils import log, heading +from run_pretraining import PretrainingConfig +from modeling import PretrainingModel + + +def from_pretrained_ckpt(args): + config = PretrainingConfig( + model_name='postprocessing', + data_dir='postprocessing', + generator_hidden_size=0.3333333, + ) + + # Padding for divisibility by 8 + if config.vocab_size % 8 != 0: + config.vocab_size += 8 - (config.vocab_size % 8) + + if args.amp: + policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16", loss_scale="dynamic") + tf.keras.mixed_precision.experimental.set_policy(policy) + print('Compute dtype: %s' % policy.compute_dtype) # Compute dtype: float16 + print('Variable dtype: %s' % policy.variable_dtype) # Variable dtype: float32 + + # Set up model + model = PretrainingModel(config) + + # Load checkpoint + checkpoint = tf.train.Checkpoint(step=tf.Variable(1), model=model) + checkpoint.restore(args.pretrained_checkpoint).expect_partial() + log(" ** Restored from {} at step {}".format(args.pretrained_checkpoint, int(checkpoint.step) - 1)) + + disc_dir = os.path.join(args.output_dir, 'discriminator') + gen_dir = os.path.join(args.output_dir, 'generator') + + heading(" ** Saving discriminator") + model.discriminator(model.discriminator.dummy_inputs) + model.discriminator.save_pretrained(disc_dir) + + heading(" ** Saving generator") + model.generator(model.generator.dummy_inputs) + model.generator.save_pretrained(gen_dir) + + +if __name__ == '__main__': + # Parse essential args + parser = argparse.ArgumentParser() + parser.add_argument('--pretrained_checkpoint') + parser.add_argument('--output_dir') + parser.add_argument('--amp', action='store_true', default=False) + args = parser.parse_args() + + from_pretrained_ckpt(args) diff --git a/modelzoo/ELECTRA/pretrain_utils.py b/modelzoo/ELECTRA/pretrain_utils.py new file mode 100644 index 00000000..029dce9f --- /dev/null +++ b/modelzoo/ELECTRA/pretrain_utils.py @@ -0,0 +1,367 @@ +# coding=utf-8 +# Copyright 2020 The Google Research Authors. +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
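# ---------------------------------------------------------------------------
# Illustrative aside (not part of the patched files): driving
# postprocess_pretrained_ckpt.py above programmatically instead of via the CLI.
# The checkpoint and output paths are hypothetical placeholders.
# ---------------------------------------------------------------------------
from types import SimpleNamespace

from postprocess_pretrained_ckpt import from_pretrained_ckpt

args = SimpleNamespace(
    pretrained_checkpoint="results/models/base/checkpoints/ckpt-57450",  # hypothetical path
    output_dir="results/models/base/postprocessed",                      # hypothetical path
    amp=False,
)
from_pretrained_ckpt(args)
# ---------------------------------------------------------------------------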
+ +"""Helpers for preparing pre-training data and supplying them to the model.""" + +import collections + +import numpy as np +import tensorflow as tf + +import utils +import tokenization + + +def get_dataset(config, batch_size, num_cpu_threads=4, world_size=1, rank=0): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + name_to_features = { + "input_ids": tf.io.FixedLenFeature([config.max_seq_length], tf.int64), + "input_mask": tf.io.FixedLenFeature([config.max_seq_length], tf.int64), + "segment_ids": tf.io.FixedLenFeature([config.max_seq_length], tf.int64), + } + + input_files = [] + for input_pattern in config.pretrain_tfrecords.split(","): + input_files.extend(tf.io.gfile.glob(input_pattern)) + + d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files)) + d = d.shard(num_shards=world_size, index=rank) + d = d.repeat() + d = d.shuffle(buffer_size=len(input_files), seed=config.seed, reshuffle_each_iteration=False) + + cycle_length = min(num_cpu_threads, len(input_files)) + d = d.interleave( + tf.data.TFRecordDataset, + cycle_length=cycle_length, + deterministic=True) + d = d.shuffle(buffer_size=100, seed=config.seed, reshuffle_each_iteration=False) + + d = d.map(lambda record: _decode_record(record, name_to_features)) + d = d.batch(batch_size) + + return d + +def _decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.io.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.cast(t, tf.int32) + example[name] = t + + return example + + +# model inputs - it's a bit nicer to use a namedtuple rather than keep the +# features as a dict +Inputs = collections.namedtuple( + "Inputs", ["input_ids", "input_mask", "segment_ids", "masked_lm_positions", + "masked_lm_ids", "masked_lm_weights"]) + + +def features_to_inputs(features): + return Inputs( + input_ids=features["input_ids"], + input_mask=features["input_mask"], + segment_ids=features["segment_ids"], + masked_lm_positions=(features["masked_lm_positions"] + if "masked_lm_positions" in features else None), + masked_lm_ids=(features["masked_lm_ids"] + if "masked_lm_ids" in features else None), + masked_lm_weights=(features["masked_lm_weights"] + if "masked_lm_weights" in features else None), + ) + + +def get_updated_inputs(inputs, **kwargs): + features = inputs._asdict() + for k, v in kwargs.items(): + features[k] = v + return features_to_inputs(features) + + +def get_shape_list(tensor, expected_rank=None, name=None): + """Returns a list of the shape of tensor, preferring static dimensions. + + Args: + tensor: A tf.Tensor object to find the shape of. + expected_rank: (optional) int. The expected rank of `tensor`. If this is + specified and the `tensor` has a different rank, and exception will be + thrown. + name: Optional name of the tensor for the error message. + + Returns: + A list of dimensions of the shape of tensor. All static dimensions will + be returned as python integers, and dynamic dimensions will be returned + as tf.Tensor scalars. 
+  """
+  if isinstance(tensor, np.ndarray) or isinstance(tensor, list):
+    shape = np.array(tensor).shape
+    if isinstance(expected_rank, int):  # plain `int`: `six` is not imported in this module
+      assert len(shape) == expected_rank
+    elif expected_rank is not None:
+      assert len(shape) in expected_rank
+    return shape
+  #
+  # if name is None:
+  #   name = tensor.name
+  #
+  # if expected_rank is not None:
+  #   assert_rank(tensor, expected_rank, name)
+
+  shape = tensor.shape.as_list()
+
+  non_static_indexes = []
+  for (index, dim) in enumerate(shape):
+    if dim is None:
+      non_static_indexes.append(index)
+
+  if not non_static_indexes:
+    return shape
+
+  dyn_shape = tf.shape(tensor)
+  for index in non_static_indexes:
+    shape[index] = dyn_shape[index]
+  return shape
+
+
+def gather_positions(sequence, positions):
+  """Gathers the vectors at the specific positions over a minibatch.
+
+  Args:
+    sequence: A [batch_size, seq_length] or
+        [batch_size, seq_length, depth] tensor of values
+    positions: A [batch_size, n_positions] tensor of indices
+
+  Returns: A [batch_size, n_positions] or
+    [batch_size, n_positions, depth] tensor of the values at the indices
+  """
+  shape = get_shape_list(sequence, expected_rank=[2, 3])
+  depth_dimension = (len(shape) == 3)
+  if depth_dimension:
+    B, L, D = shape
+  else:
+    B, L = shape
+    D = 1
+    sequence = tf.expand_dims(sequence, -1)
+  position_shift = tf.expand_dims(L * tf.range(B), -1)
+  flat_positions = tf.reshape(positions + position_shift, [-1])
+  flat_sequence = tf.reshape(sequence, [B * L, D])
+  gathered = tf.gather(flat_sequence, flat_positions)
+  if depth_dimension:
+    return tf.reshape(gathered, [B, -1, D])
+  else:
+    return tf.reshape(gathered, [B, -1])
+
+
+def scatter_update(sequence, updates, positions):
+  """Scatter-update a sequence.
+
+  Args:
+    sequence: A [batch_size, seq_len] or [batch_size, seq_len, depth] tensor
+    updates: A tensor of size batch_size*seq_len(*depth)
+    positions: A [batch_size, n_positions] tensor
+
+  Returns: A tuple of two tensors. First is a [batch_size, seq_len] or
+    [batch_size, seq_len, depth] tensor of "sequence" with elements at
+    "positions" replaced by the values at "updates." Updates to index 0 are
+    ignored. If there are duplicated positions the update is only applied once.
+    Second is a [batch_size, seq_len] mask tensor of which inputs were updated.
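+
+  For example, scatter_update(tf.constant([[10, 11, 12, 13]]),
+  tf.constant([[99]]), tf.constant([[2]])) returns
+  ([[10, 11, 99, 13]], [[0, 0, 1, 0]]).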
+ """ + shape = get_shape_list(sequence, expected_rank=[2, 3]) + depth_dimension = (len(shape) == 3) + if depth_dimension: + B, L, D = shape + else: + B, L = shape + D = 1 + sequence = tf.expand_dims(sequence, -1) + N = get_shape_list(positions)[1] + + shift = tf.expand_dims(L * tf.range(B), -1) + flat_positions = tf.reshape(positions + shift, [-1, 1]) + flat_updates = tf.reshape(updates, [-1, D]) + updates = tf.scatter_nd(flat_positions, flat_updates, [B * L, D]) + updates = tf.reshape(updates, [B, L, D]) + + flat_updates_mask = tf.ones([B * N], tf.int32) + updates_mask = tf.scatter_nd(flat_positions, flat_updates_mask, [B * L]) + updates_mask = tf.reshape(updates_mask, [B, L]) + not_first_token = tf.concat([tf.zeros((B, 1), tf.int32), + tf.ones((B, L - 1), tf.int32)], -1) + updates_mask *= not_first_token + updates_mask_3d = tf.expand_dims(updates_mask, -1) + + # account for duplicate positions + if sequence.dtype == tf.float32: + updates_mask_3d = tf.cast(updates_mask_3d, tf.float32) + updates /= tf.maximum(1.0, updates_mask_3d) + else: + assert sequence.dtype == tf.int32 + updates = tf.math.floordiv(updates, tf.maximum(1, updates_mask_3d)) + updates_mask = tf.minimum(updates_mask, 1) + updates_mask_3d = tf.minimum(updates_mask_3d, 1) + + updated_sequence = (((1 - updates_mask_3d) * sequence) + + (updates_mask_3d * updates)) + if not depth_dimension: + updated_sequence = tf.squeeze(updated_sequence, -1) + + return updated_sequence, updates_mask + + +def _get_candidates_mask(inputs: Inputs, vocab, + disallow_from_mask=None): + """Returns a mask tensor of positions in the input that can be masked out.""" + ignore_ids = [vocab["[SEP]"], vocab["[CLS]"], vocab["[MASK]"]] + candidates_mask = tf.ones_like(inputs.input_ids, tf.bool) + for ignore_id in ignore_ids: + candidates_mask &= tf.not_equal(inputs.input_ids, ignore_id) + candidates_mask &= tf.cast(inputs.input_mask, tf.bool) + if disallow_from_mask is not None: + candidates_mask &= ~disallow_from_mask + return candidates_mask + + +def mask(config, inputs, mask_prob, proposal_distribution=1.0, + disallow_from_mask=None, already_masked=None): + """Implementation of dynamic masking. The optional arguments aren't needed for + BERT/ELECTRA and are from early experiments in "strategically" masking out + tokens instead of uniformly at random. + + Args: + config: configure_pretraining.PretrainingConfig + inputs: pretrain_data.Inputs containing input input_ids/input_mask + mask_prob: percent of tokens to mask + proposal_distribution: for non-uniform masking can be a [B, L] tensor + of scores for masking each position. 
+ disallow_from_mask: a boolean tensor of [B, L] of positions that should + not be masked out + already_masked: a boolean tensor of [B, N] of already masked-out tokens + for multiple rounds of masking + Returns: a pretrain_data.Inputs with masking added + """ + # Get the batch size, sequence length, and max masked-out tokens + N = config.max_predictions_per_seq + B, L = get_shape_list(inputs.input_ids) + + # Find indices where masking out a token is allowed + vocab = tokenization.ElectraTokenizer( + config.vocab_file, do_lower_case=config.do_lower_case).get_vocab() + candidates_mask = _get_candidates_mask(inputs, vocab, disallow_from_mask) + + # Set the number of tokens to mask out per example + num_tokens = tf.cast(tf.reduce_sum(inputs.input_mask, -1), tf.float32) + num_to_predict = tf.maximum(1, tf.minimum( + N, tf.cast(tf.round(num_tokens * mask_prob), tf.int32))) + masked_lm_weights = tf.cast(tf.sequence_mask(num_to_predict, N), tf.float32) + if already_masked is not None: + masked_lm_weights *= (1 - already_masked) + + # Get a probability of masking each position in the sequence + candidate_mask_float = tf.cast(candidates_mask, tf.float32) + sample_prob = (proposal_distribution * candidate_mask_float) + sample_prob /= tf.reduce_sum(sample_prob, axis=-1, keepdims=True) + + # Sample the positions to mask out + sample_prob = tf.stop_gradient(sample_prob) + sample_logits = tf.math.log(sample_prob) + masked_lm_positions = tf.random.categorical( + sample_logits, N, dtype=tf.int32) + masked_lm_positions *= tf.cast(masked_lm_weights, tf.int32) + + # Get the ids of the masked-out tokens + shift = tf.expand_dims(L * tf.range(B), -1) + flat_positions = tf.reshape(masked_lm_positions + shift, [-1, 1]) + masked_lm_ids = tf.gather_nd(tf.reshape(inputs.input_ids, [-1]), + flat_positions) + masked_lm_ids = tf.reshape(masked_lm_ids, [B, -1]) + masked_lm_ids *= tf.cast(masked_lm_weights, tf.int32) + + # Update the input ids + replace_with_mask_positions = masked_lm_positions * tf.cast( + tf.less(tf.random.uniform([B, N]), 0.85), tf.int32) + inputs_ids, _ = scatter_update( + inputs.input_ids, tf.fill([B, N], vocab["[MASK]"]), + replace_with_mask_positions) + + return get_updated_inputs( + inputs, + input_ids=tf.stop_gradient(inputs_ids), + masked_lm_positions=masked_lm_positions, + masked_lm_ids=masked_lm_ids, + masked_lm_weights=masked_lm_weights + ) + + +def unmask(inputs: Inputs): + unmasked_input_ids, _ = scatter_update( + inputs.input_ids, inputs.masked_lm_ids, inputs.masked_lm_positions) + return get_updated_inputs(inputs, input_ids=unmasked_input_ids) + + +def sample_from_softmax(logits, disallow=None): + if disallow is not None: + logits -= 1000.0 * disallow + uniform_noise = tf.random.uniform( + get_shape_list(logits), minval=0, maxval=1) + gumbel_noise = tf.cast(-tf.math.log(-tf.math.log(uniform_noise + 1e-9) + 1e-9), logits.dtype) + return tf.one_hot(tf.argmax(tf.nn.softmax(logits + gumbel_noise), -1, + output_type=tf.int32), logits.shape[-1]) + + +ENDC = "\033[0m" +COLORS = ["\033[" + str(n) + "m" for n in list(range(91, 97)) + [90]] +RED = COLORS[0] +BLUE = COLORS[3] +CYAN = COLORS[5] +GREEN = COLORS[1] + + +def print_tokens(inputs: Inputs, inv_vocab, updates_mask=None): + """Pretty-print model inputs.""" + pos_to_tokid = {} + for tokid, pos, weight in zip( + inputs.masked_lm_ids[0], inputs.masked_lm_positions[0], + inputs.masked_lm_weights[0]): + if weight == 0: + pass + else: + pos_to_tokid[pos] = tokid + + text = "" + provided_update_mask = (updates_mask is not None) + if not 
provided_update_mask: + updates_mask = np.zeros_like(inputs.input_ids) + for pos, (tokid, um) in enumerate( + zip(inputs.input_ids[0], updates_mask[0])): + token = inv_vocab[tokid] + if token == "[PAD]": + break + if pos in pos_to_tokid: + token = RED + token + " (" + inv_vocab[pos_to_tokid[pos]] + ")" + ENDC + if provided_update_mask: + assert um == 1 + else: + if provided_update_mask: + assert um == 0 + text += token + " " + utils.log(utils.printable_text(text)) diff --git a/modelzoo/ELECTRA/run.sub b/modelzoo/ELECTRA/run.sub new file mode 100644 index 00000000..50696b25 --- /dev/null +++ b/modelzoo/ELECTRA/run.sub @@ -0,0 +1,88 @@ +#!/bin/bash +#SBATCH --exclusive +#SBATCH --mem=0 +#SBATCH --overcommit + +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -eux +# Docker image resulting from bash scripts/docker/build.sh +readonly docker_image="gitlab-master.nvidia.com/dl/joc/electra_tf2:keras_mp_20.07_clean_up" +# Location of dataset for phase 1 amd phase 2 +readonly datadir="/lustre/fsw/joc-luna/sharatht/electra_tf2_data/" + +readonly mounts=".:/workspace/electra,${datadir}:/workspace/electra/data" + +DGXSYSTEM=DGXA100 +cluster="selene" +if [[ "${DGXSYSTEM}" == DGX2* ]]; then + cluster='circe' +fi +if [[ "${DGXSYSTEM}" == DGXA100* ]]; then + cluster='selene' +fi + +BIND_CMD="./scripts/bind.sh --cpu=exclusive --ib=single --cluster=$cluster -- " + +BATCHSIZE=${BATCHSIZE:-16} +PHASE=${PHASE:-1} +LR=${LR:-3e-3} +STEPS=${STEPS:-57450} +WARMUP=${WARMUP:-3750} +GRAD_ACCUM_STEPS=${GRAD_ACCUM_STEPS:-1} +b1=${b1:-"0.878"} +b2=${b2:-"0.974"} +decay=${decay:-"0.5"} +end_lr=${end_lr:-"0.0"} +skip_adaptive=${skip_adaptive:-"yes"} +model_count=${model_count:-1} + +skip_flag="" +if [ "$skip_adaptive" = "yes" ] ; then + skip_flag=" --skip_adaptive" +fi + +ckpt_STEPS=$(awk -v a=$STEPS 'BEGIN { print a / 10}') + +if [ "$PHASE" = "1" ] ; then + +LAUNCH_CMD="$BIND_CMD python run_pretraining.py \ + --model_name='electra_keras_mp_base_lamb_48x8x${BATCHSIZE}x${GRAD_ACCUM_STEPS}_p1_skip_adaptive_${skip_adaptive}_LR_${LR}_WARMUP_${WARMUP}_STEPS_${STEPS}_b1_${b1}_b2_${b2}_decay_${decay}_end_lr_${end_lr}_${model_count}' \ + --pretrain_tfrecords='/workspace/electra/data/tfrecord_lower_case_1_seq_len_128_random_seed_12345/books_wiki_en_corpus/train/pretrain_data*' \ + --num_train_steps=$STEPS \ + --num_warmup_steps=$WARMUP \ + --disc_weight=50.0 \ + --generator_hidden_size=0.3333333 \ + --learning_rate=$LR \ + --train_batch_size=$BATCHSIZE \ + --max_seq_length=128 --log_freq=10 \ + --save_checkpoints_steps=$ckpt_STEPS \ + --optimizer='lamb' $skip_flag --opt_beta_1=$b1 --opt_beta_2=$b2 --lr_decay_power=$decay --end_lr=$end_lr $skip_flag --gradient_accumulation_steps=$GRAD_ACCUM_STEPS --amp --xla " +else +LAUNCH_CMD="$BIND_CMD python run_pretraining.py \ + --model_name='electra_keras_mp_base_lamb_48x8x176x1_p1_skip_adaptive_yes_LR_6e-3_WARMUP_2000_STEPS_10000_b1_0.878_b2_0.974_decay_0.5_end_lr_0.0_${model_count}' \ + 
--pretrain_tfrecords='/workspace/electra/data/tfrecord_lower_case_1_seq_len_512_random_seed_12345/books_wiki_en_corpus/train/pretrain_data*' \ + --num_train_steps=$STEPS \ + --num_warmup_steps=$WARMUP \ + --disc_weight=50.0 \ + --generator_hidden_size=0.3333333 \ + --learning_rate=$LR \ + --train_batch_size=$BATCHSIZE \ + --max_seq_length=512 --log_freq=10 \ + --restore_checkpoint --phase2 \ + --save_checkpoints_steps=$ckpt_STEPS \ + --optimizer='lamb' $skip_flag --opt_beta_1=$b1 --opt_beta_2=$b2 --lr_decay_power=$decay --end_lr=$end_lr $skip_flag --gradient_accumulation_steps=$GRAD_ACCUM_STEPS --amp --xla " +fi; + +srun --mpi=pmi2 -l --container-image="${docker_image}" --container-mounts="${mounts}" bash -c "${LAUNCH_CMD}" diff --git a/modelzoo/ELECTRA/run_inference.py b/modelzoo/ELECTRA/run_inference.py new file mode 100644 index 00000000..436f5814 --- /dev/null +++ b/modelzoo/ELECTRA/run_inference.py @@ -0,0 +1,212 @@ +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import subprocess +import time +import argparse +import json +import logging +import collections + +import tensorflow as tf + +if sys.version_info[0] == 2: + import cPickle as pickle +else: + import pickle + +from configuration import ElectraConfig +from modeling import TFElectraForQuestionAnswering +from tokenization import ElectraTokenizer +from squad_utils import SquadResult, RawResult, _get_best_indices + +TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "google/electra-small-generator", + "google/electra-base-generator", + "google/electra-large-generator", + "google/electra-small-discriminator", + "google/electra-base-discriminator", + "google/electra-large-discriminator", + # See all ELECTRA models at https://huggingface.co/models?filter=electra +] + +_PrelimPrediction = collections.namedtuple( + "PrelimPrediction", + ["start_index", "end_index", "start_logit", "end_logit"]) + + +def parse_args(): + parser = argparse.ArgumentParser() + + # Required parameters + parser.add_argument("--electra_model", default=None, type=str, required=True, + help="Model selected in the list: " + ", ".join(TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST)) + parser.add_argument("--init_checkpoint", + default=None, + type=str, + required=True, + help="The checkpoint file from pretraining") + parser.add_argument("--question", + default=None, + type=str, + required=True, + help="Question") + parser.add_argument("--context", + default=None, + type=str, + required=True, + help="Context") + parser.add_argument( + "--joint_head", + default=True, + type=bool, + help="Jointly predict the start and end positions", + ) + parser.add_argument( + "--beam_size", + default=4, + type=int, + help="Beam size when doing joint predictions", + ) + parser.add_argument("--n_best_size", default=20, type=int, + help="The total number of n-best predictions to generate in the nbest_predictions.json " + "output file.") + parser.add_argument("--max_answer_length", default=30, type=int, + help="The 
maximum length of an answer that can be generated. This is needed because the start " + "and end predictions are not conditioned on one another.") + + parser.add_argument('--version_2_with_negative', + action='store_true', + help='If true, the SQuAD examples contain some that do not have an answer.') + parser.add_argument('--null_score_diff_threshold', + type=float, default=0.0, + help="If null_score - best_non_null is greater than the threshold predict null.") + + args = parser.parse_args() + + return args + + +def get_predictions_joint_head(start_indices, end_indices, result, max_len, args): + predictions = [] + for i in range(args.beam_size): + start_index = start_indices[i] + for j in range(args.beam_size): + # for end_index in end_indices: + end_index = end_indices[i * args.beam_size + j] + if start_index >= max_len: + continue + if end_index >= max_len: + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > args.max_answer_length: + continue + predictions.append( + _PrelimPrediction( + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[i], + end_logit=result.end_logits[i * args.beam_size + j])) + return predictions + + +def get_predictions(start_indices, end_indices, result, max_len, args): + predictions = [] + for start_index in start_indices: + for end_index in end_indices: + if start_index >= max_len: + continue + if end_index >= max_len: + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > args.max_answer_length: + continue + predictions.append( + _PrelimPrediction( + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[start_index], + end_logit=result.end_logits[end_index])) + return predictions + + +def main(): + args = parse_args() + print("***** Loading tokenizer and model *****") + electra_model = args.electra_model + config = ElectraConfig.from_pretrained(electra_model) + tokenizer = ElectraTokenizer.from_pretrained(electra_model) + model = TFElectraForQuestionAnswering.from_pretrained(electra_model, config=config, args=args) + + print("***** Loading fine-tuned checkpoint: {} *****".format(args.init_checkpoint)) + model.load_weights(args.init_checkpoint, by_name=False, skip_mismatch=False).expect_partial() + + question, text = args.question, args.context + encoding = tokenizer.encode_plus(question, text, return_tensors='tf') + input_ids, token_type_ids, attention_mask = encoding["input_ids"], encoding["token_type_ids"], \ + encoding["attention_mask"] + all_tokens = tokenizer.convert_ids_to_tokens(input_ids.numpy()[0]) + if not args.joint_head: + start_logits, end_logits = model(input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + )[:2] + start_logits = start_logits[0].numpy().tolist() + end_logits = end_logits[0].numpy().tolist() + result = RawResult(unique_id=0, + start_logits=start_logits, + end_logits=end_logits) + + start_indices = _get_best_indices(result.start_logits, args.n_best_size) + end_indices = _get_best_indices(result.end_logits, args.n_best_size) + predictions = get_predictions(start_indices, end_indices, result, len(all_tokens), args) + null_score = result.start_logits[0] + result.end_logits[0] + + else: + outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids) + output = [output[0].numpy().tolist() for output in outputs] + start_logits = output[0] + start_top_index = output[1] + end_logits = output[2] + end_top_index = output[3] + 
cls_logits = output[4] + result = SquadResult( + 0, + start_logits, + end_logits, + start_top_index=start_top_index, + end_top_index=end_top_index, + cls_logits=cls_logits, + ) + predictions = get_predictions_joint_head(result.start_top_index, result.end_top_index, result, len(all_tokens), args) + null_score = result.cls_logits + + predictions = sorted(predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True) + answer = predictions[0] + answer = ' '.join(all_tokens[answer.start_index: answer.end_index + 1]) + if args.null_score_diff_threshold > null_score and args.version_2_with_negative: + answer = '' + + print(answer) + + return answer + + +if __name__ == "__main__": + main() diff --git a/modelzoo/ELECTRA/run_pretraining.py b/modelzoo/ELECTRA/run_pretraining.py new file mode 100644 index 00000000..eb58e9fd --- /dev/null +++ b/modelzoo/ELECTRA/run_pretraining.py @@ -0,0 +1,505 @@ +# coding=utf-8 +# Copyright 2020 The Google Research Authors. +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Pre-trains an ELECTRA model.""" + +import argparse +import collections +import json +import time +import datetime +import os + +import tensorflow as tf +import horovod.tensorflow as hvd +from horovod.tensorflow.compression import Compression +from gpu_affinity import set_affinity + +import utils +import sys +import pretrain_utils +from utils import get_rank, get_world_size, is_main_process, log, log_config, setup_logger, postprocess_dllog +from tokenization import ElectraTokenizer +from modeling import PretrainingModel +from optimization import create_optimizer, GradientAccumulator +import dllogger + +class PretrainingConfig(object): + """Defines pre-training hyperparameters.""" + + def __init__(self, model_name, **kwargs): + self.model_name = model_name + self.seed = 42 + + self.debug = False # debug mode for quickly running things + self.do_train = True # pre-train ELECTRA + self.do_eval = False # evaluate generator/discriminator on unlabeled data + self.phase2 = False + + # amp + self.amp = True + self.xla = True + self.fp16_compression = False + + # optimizer type + self.optimizer = 'adam' + self.gradient_accumulation_steps = 1 + + # lamb whitelisting for LN and biases + self.skip_adaptive = False + + # loss functions + self.electra_objective = True # if False, use the BERT objective instead + self.gen_weight = 1.0 # masked language modeling / generator loss + self.disc_weight = 50.0 # discriminator loss + self.mask_prob = 0.15 # percent of input tokens to mask out / replace + + # optimization + self.learning_rate = 5e-4 + self.lr_decay_power = 0.5 + self.weight_decay_rate = 0.01 + self.num_warmup_steps = 10000 + self.opt_beta_1 = 0.878 + self.opt_beta_2 = 0.974 + self.end_lr = 0.0 + + # training settings + self.log_freq = 10 + self.skip_checkpoint = False + self.save_checkpoints_steps = 1000 + self.num_train_steps = 1000000 + self.num_eval_steps = 100 + self.keep_checkpoint_max = 5 # maximum number of recent checkpoint 
files to keep; change to 0 or None to keep all checkpoints + self.restore_checkpoint = None + self.load_weights = False + self.steps_this_run = -1 + + # model settings + self.model_size = "base" # one of "small", "base", or "large" + # override the default transformer hparams for the provided model size; see + # modeling.BertConfig for the possible hparams and util.training_utils for + # the defaults + self.model_hparam_overrides = ( + kwargs["model_hparam_overrides"] + if "model_hparam_overrides" in kwargs else {}) + self.embedding_size = None # bert hidden size by default + self.vocab_size = 30522 # number of tokens in the vocabulary + self.do_lower_case = True # lowercase the input? + + # generator settings + self.uniform_generator = False # generator is uniform at random + self.shared_embeddings = True # share generator/discriminator token embeddings? + # self.untied_generator = True # tie all generator/discriminator weights? + self.generator_layers = 1.0 # frac of discriminator layers for generator + self.generator_hidden_size = 0.25 # frac of discrim hidden size for gen + self.disallow_correct = False # force the generator to sample incorrect + # tokens (so 15% of tokens are always + # fake) + self.temperature = 1.0 # temperature for sampling from generator + + # batch sizes + self.max_seq_length = 128 + self.train_batch_size = 128 + self.eval_batch_size = 128 + + self.results_dir = "results" + self.json_summary = None + self.update(kwargs) + # default locations of data files + + self.pretrain_tfrecords = os.path.join( + "data", "pretrain_tfrecords/pretrain_data.tfrecord*") + self.vocab_file = os.path.join("vocab", "vocab.txt") + self.model_dir = os.path.join(self.results_dir, "models", model_name) + self.checkpoints_dir = os.path.join(self.model_dir, "checkpoints") + self.weights_dir = os.path.join(self.model_dir, "weights") + self.results_txt = os.path.join(self.results_dir, "unsup_results.txt") + self.results_pkl = os.path.join(self.results_dir, "unsup_results.pkl") + self.log_dir = os.path.join(self.model_dir, "logs") + + self.max_predictions_per_seq = int((self.mask_prob + 0.005) * + self.max_seq_length) + + # defaults for different-sized model + if self.model_size == "base": + self.embedding_size = 768 + self.hidden_size = 768 + self.num_hidden_layers = 12 + if self.hidden_size % 64 != 0: + raise ValueError("Hidden size {} should be divisible by 64. Number of attention heads is hidden size {} / 64 ".format(self.hidden_size, self.hidden_size)) + self.num_attention_heads = int(self.hidden_size / 64.) + elif self.model_size == "large": + self.embedding_size = 1024 + self.hidden_size = 1024 + self.num_hidden_layers = 24 + if self.hidden_size % 64 != 0: + raise ValueError("Hidden size {} should be divisible by 64. Number of attention heads is hidden size {} / 64 ".format(self.hidden_size, self.hidden_size)) + self.num_attention_heads = int(self.hidden_size / 64.) 
+ else: + raise ValueError("--model_size : 'base' and 'large supported only.") + self.act_func = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + + self.update(kwargs) + + def update(self, kwargs): + for k, v in kwargs.items(): + if v is not None: + self.__dict__[k] = v + + +def metric_fn(config, metrics, eval_fn_inputs): + """Computes the loss and accuracy of the model.""" + d = eval_fn_inputs + metrics["masked_lm_accuracy"].update_state( + y_true=tf.reshape(d["masked_lm_ids"], [-1]), + y_pred=tf.reshape(d["masked_lm_preds"], [-1]), + sample_weight=tf.reshape(d["masked_lm_weights"], [-1])) + metrics["masked_lm_loss"].update_state( + values=tf.reshape(d["mlm_loss"], [-1]), + sample_weight=tf.reshape(d["masked_lm_weights"], [-1])) + if config.electra_objective: + metrics["sampled_masked_lm_accuracy"].update_state( + y_true=tf.reshape(d["masked_lm_ids"], [-1]), + y_pred=tf.reshape(d["sampled_tokids"], [-1]), + sample_weight=tf.reshape(d["masked_lm_weights"], [-1])) + if config.disc_weight > 0: + metrics["disc_loss"].update_state(d["disc_loss"]) + #metrics["disc_auc"].update_state( + # d["disc_labels"] * d["input_mask"], + # d["disc_probs"] * tf.cast(d["input_mask"], tf.float32)) + metrics["disc_accuracy"].update_state( + y_true=d["disc_labels"], y_pred=d["disc_preds"], + sample_weight=d["input_mask"]) + metrics["disc_precision"].update_state( + y_true=d["disc_labels"], y_pred=d["disc_preds"], + sample_weight=d["disc_preds"] * d["input_mask"]) + metrics["disc_recall"].update_state( + y_true=d["disc_labels"], y_pred=d["disc_preds"], + sample_weight=d["disc_labels"] * d["input_mask"]) + return metrics + +@tf.function +def train_one_step(config, model, optimizer, features, accumulator, first_step, take_step, clip_norm=1.0): + + #Forward and Backward pass + with tf.GradientTape() as tape: + total_loss, eval_fn_inputs = model(features, is_training=True) + unscaled_loss = tf.stop_gradient(total_loss) + if config.amp: + total_loss = optimizer.get_scaled_loss(total_loss) + + #Backpropogate gradients + #tape = hvd.DistributedGradientTape( + # tape, sparse_as_dense=True, + # compression=Compression.fp16 if config.amp and config.fp16_compression else Compression.none) + gradients = tape.gradient(total_loss, model.trainable_variables) + + #Get unscaled gradients if AMP + if config.amp: + gradients = optimizer.get_unscaled_gradients(gradients) + + #Accumulate gradients + accumulator(gradients) + #Need to call apply_gradients on very first step irrespective of gradient accumulation + #This is required for the optimizer to build it's states + if first_step or take_step: + #All reduce and Clip the accumulated gradients + allreduced_accumulated_gradients = [None if g is None else hvd.allreduce(g / tf.cast(config.gradient_accumulation_steps, g.dtype), + compression=Compression.fp16 if config.amp and config.fp16_compression else Compression.none) + for g in accumulator.gradients] + (clipped_accumulated_gradients, _) = tf.clip_by_global_norm(allreduced_accumulated_gradients, clip_norm=clip_norm) + #Weight update + optimizer.apply_gradients(zip(clipped_accumulated_gradients, model.trainable_variables)) + accumulator.reset() + + #brodcast model weights after first train step + if first_step: + hvd.broadcast_variables(model.variables, root_rank=0) + hvd.broadcast_variables(optimizer.variables(), root_rank=0) + + return unscaled_loss, eval_fn_inputs + +def main(e2e_start_time): + # Parse essential argumentss + parser = argparse.ArgumentParser() + parser.add_argument("--model_name", 
required=True) + parser.add_argument("--model_size", default="base", type=str, help="base or large") + parser.add_argument("--pretrain_tfrecords", type=str) + parser.add_argument("--phase2", action='store_true') + parser.add_argument("--fp16_compression", action='store_true') + parser.add_argument("--amp", action='store_true', + help="Whether to use fp16.") + parser.add_argument("--xla", action='store_true', + help="Whether to use xla.") + parser.add_argument("--seed", default=42, type=int) + parser.add_argument("--num_train_steps", type=int) + parser.add_argument("--num_warmup_steps", type=int) + parser.add_argument("--learning_rate", type=float) + parser.add_argument("--train_batch_size", type=int) + parser.add_argument("--max_seq_length", type=int) + + parser.add_argument("--mask_prob", type=float) + parser.add_argument("--disc_weight", type=float) + parser.add_argument("--generator_hidden_size", type=float) + + parser.add_argument("--log_freq", type=int, default=10, help="Training metrics logging frequency") + parser.add_argument("--save_checkpoints_steps", type=int) + parser.add_argument("--steps_this_run", type=int, default=-1, help="run a fixed number of steps only") + parser.add_argument("--keep_checkpoint_max", type=int) + parser.add_argument("--restore_checkpoint", default=None, type=str) + parser.add_argument("--load_weights", action='store_true') + parser.add_argument("--weights_dir") + + parser.add_argument("--optimizer", default="adam", type=str, help="adam or lamb") + parser.add_argument("--skip_adaptive", action='store_true', help="Whether to apply adaptive LR on LayerNorm and biases") + parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="Number of Gradient Accumulation steps") + parser.add_argument("--lr_decay_power", type=float, default=0.5, help="LR decay power") + parser.add_argument("--opt_beta_1", type=float, default=0.878, help="Optimizer beta1") + parser.add_argument("--opt_beta_2", type=float, default=0.974, help="Optimizer beta2") + parser.add_argument("--end_lr", type=float, default=0.0, help="Ending LR") + parser.add_argument("--log_dir", type=str, default=None, help="Path to store logs") + parser.add_argument("--results_dir", type=str, default=None, help="Path to store all model results") + parser.add_argument("--skip_checkpoint", action='store_true', default=False, help="Path to store logs") + parser.add_argument('--json-summary', type=str, default=None, + help='If provided, the json summary will be written to the specified file.') + args = parser.parse_args() + config = PretrainingConfig(**args.__dict__) + # Padding for divisibility by 8 + if config.vocab_size % 8 != 0: + config.vocab_size += 8 - (config.vocab_size % 8) + + # Set up tensorflow + hvd.init() + + args.log_dir = config.log_dir + # DLLogger + setup_logger(args) + dllogger.metadata('training_sequences_per_second', {'unit': 'sequences/s'}) + dllogger.metadata('final_loss', {'unit': None}) + dllogger.metadata('e2e_train_time', {'unit': 's'}) + + set_affinity(hvd.local_rank()) + gpus = tf.config.experimental.list_physical_devices('GPU') + if gpus: + for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') + tf.config.optimizer.set_jit(config.xla) + #tf.config.optimizer.set_experimental_options({"auto_mixed_precision": config.amp}) + + if config.amp: + policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16", loss_scale="dynamic") + 
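+    # Under this policy compute runs in float16 while variables stay float32; the "dynamic"
+    # loss scale pairs with the LossScaleOptimizer wrapper created for the optimizer below.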
tf.keras.mixed_precision.experimental.set_policy(policy) + print('Compute dtype: %s' % policy.compute_dtype) # Compute dtype: float16 + print('Variable dtype: %s' % policy.variable_dtype) # Variable dtype: float32 + + #tf.random.set_seed(config.seed) + + # Set up config cont' + if config.load_weights and config.restore_checkpoint: + raise ValueError("`load_weights` and `restore_checkpoint` should not be on at the same time.") + if config.phase2 and not config.restore_checkpoint: + raise ValueError("`phase2` cannot be used without `restore_checkpoint`.") + utils.heading("Config:") + log_config(config) + + # Save pretrain configs + pretrain_config_json = os.path.join(config.checkpoints_dir, 'pretrain_config.json') + if is_main_process(): + utils.write_json(config.__dict__, pretrain_config_json) + log("Configuration saved in {}".format(pretrain_config_json)) + + # Set up model + model = PretrainingModel(config) + + # Set up metrics + metrics = dict() + metrics["train_perf"] = tf.keras.metrics.Mean(name="train_perf") + metrics["total_loss"] = tf.keras.metrics.Mean(name="total_loss") + metrics["masked_lm_accuracy"] = tf.keras.metrics.Accuracy(name="masked_lm_accuracy") + metrics["masked_lm_loss"] = tf.keras.metrics.Mean(name="masked_lm_loss") + if config.electra_objective: + metrics["sampled_masked_lm_accuracy"] = tf.keras.metrics.Accuracy(name="sampled_masked_lm_accuracy") + if config.disc_weight > 0: + metrics["disc_loss"] = tf.keras.metrics.Mean(name="disc_loss") + metrics["disc_auc"] = tf.keras.metrics.AUC(name="disc_auc") + metrics["disc_accuracy"] = tf.keras.metrics.Accuracy(name="disc_accuracy") + metrics["disc_precision"] = tf.keras.metrics.Accuracy(name="disc_precision") + metrics["disc_recall"] = tf.keras.metrics.Accuracy(name="disc_recall") + + # Set up tensorboard + current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + train_log_dir = os.path.join(config.log_dir, current_time, + 'train_' + str(get_rank()) + '_of_' + str(get_world_size())) + train_summary_writer = tf.summary.create_file_writer(train_log_dir) + + # Set up dataset + dataset = pretrain_utils.get_dataset( + config, config.train_batch_size, world_size=get_world_size(), rank=get_rank()) + train_iterator = iter(dataset) + + # Set up optimizer + optimizer = create_optimizer( + init_lr=config.learning_rate, + num_train_steps=config.num_train_steps, + num_warmup_steps=config.num_warmup_steps, + weight_decay_rate=config.weight_decay_rate, + optimizer=config.optimizer, + skip_adaptive=config.skip_adaptive, + power=config.lr_decay_power, + beta_1=config.opt_beta_1, + beta_2=config.opt_beta_2, + end_lr=config.end_lr) + + accumulator = GradientAccumulator() + if config.amp: + optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic") + + # Set up model checkpoint + checkpoint = tf.train.Checkpoint( + step=tf.Variable(0), phase2=tf.Variable(False), optimizer=optimizer, model=model) + manager = tf.train.CheckpointManager(checkpoint, config.checkpoints_dir, max_to_keep=config.keep_checkpoint_max) + if config.restore_checkpoint and config.restore_checkpoint != "latest": + checkpoint.restore(config.restore_checkpoint) + log(" ** Restored model checkpoint from {}".format(config.restore_checkpoint)) + elif config.restore_checkpoint and config.restore_checkpoint == "latest" and manager.latest_checkpoint: + checkpoint.restore(manager.latest_checkpoint) + log(" ** Restored model checkpoint from {}".format(manager.latest_checkpoint)) + elif config.load_weights: + 
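+    # Run both sub-models once on their dummy inputs so their Keras variables are built
+    # before load_weights() maps the pretrained H5 weights onto them.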
model.generator(model.generator.dummy_inputs) + model.discriminator(model.discriminator.dummy_inputs) + model.generator.load_weights(os.path.join(config.weights_dir, 'generator', 'tf_model.h5')) + model.discriminator.load_weights(os.path.join(config.weights_dir, 'discriminator', 'tf_model.h5')) + else: + log(" ** Initializing from scratch.") + + restore_iterator = bool(config.restore_checkpoint) and config.restore_checkpoint == "latest" + # Initialize global step for phase2 + if config.phase2 and not bool(checkpoint.phase2): + optimizer.iterations.assign(0) + checkpoint.step.assign(0) + checkpoint.phase2.assign(True) + restore_iterator = False + if bool(checkpoint.phase2): + manager = tf.train.CheckpointManager( + checkpoint, config.checkpoints_dir, + checkpoint_name='ckpt-p2', + max_to_keep=config.keep_checkpoint_max) + + # Set up iterator checkpoint + iter_checkpoint = tf.train.Checkpoint( + train_iterator=train_iterator, world_size=tf.Variable(get_world_size()), rank=tf.Variable(get_rank())) + iter_manager = tf.train.CheckpointManager( + iter_checkpoint, + os.path.join(config.checkpoints_dir, 'iter_ckpt_rank_' + '{:02}'.format(get_rank())), + checkpoint_name='iter_ckpt_rank_' + '{:02}'.format(get_rank()), + max_to_keep=config.keep_checkpoint_max) + if restore_iterator and iter_manager.latest_checkpoint: + ckpt_world_size = tf.train.load_variable( + iter_manager.latest_checkpoint, 'world_size/.ATTRIBUTES/VARIABLE_VALUE') + if ckpt_world_size == get_world_size(): + iter_checkpoint.restore(iter_manager.latest_checkpoint) + log(" ** Restored iterator checkpoint from {}".format(iter_manager.latest_checkpoint), all_rank=True) + + utils.heading("Running training") + accumulator.reset() + train_start, start_step = time.time(), int(checkpoint.step) - 1 + local_step = 0 + saved_ckpt = False + while int(checkpoint.step) <= config.num_train_steps: + saved_ckpt = False + step = int(checkpoint.step) + features = next(train_iterator) + iter_start = time.time() + + # if step == 200: tf.profiler.experimental.start(logdir=train_log_dir) + total_loss, eval_fn_inputs = train_one_step(config, model, optimizer, features, accumulator, + local_step==1, take_step=local_step % args.gradient_accumulation_steps == 0) + # if step == 300: tf.profiler.experimental.stop() + + metrics["train_perf"].update_state( + config.train_batch_size * get_world_size() / (time.time() - iter_start)) + metrics["total_loss"].update_state(values=total_loss) + metric_fn(config, metrics, eval_fn_inputs) + + if (step % args.log_freq == 0) and (local_step % args.gradient_accumulation_steps == 0): + log_info_dict = {k:float(v.result().numpy() * 100) if "accuracy" in k else float(v.result().numpy()) for k, v in metrics.items()} + dllogger.log(step=(step,), data=log_info_dict, verbosity=0) + log('Step:{step:6d}, Loss:{total_loss:10.6f}, Gen_loss:{masked_lm_loss:10.6f}, Disc_loss:{disc_loss:10.6f}, Gen_acc:{masked_lm_accuracy:6.2f}, ' + 'Disc_acc:{disc_accuracy:6.2f}, Perf:{train_perf:4.0f}, Loss Scaler: {loss_scale}, Elapsed: {elapsed}, ETA: {eta}, '.format( + step=step, **log_info_dict, + loss_scale=optimizer.loss_scale if config.amp else 1, + elapsed=utils.get_readable_time(time.time() - train_start), + eta=utils.get_readable_time( + (time.time() - train_start) / (step - start_step) * (config.num_train_steps - step))), + all_rank=True) + + with train_summary_writer.as_default(): + for key, m in metrics.items(): + tf.summary.scalar(key, m.result(), step=step) + + if int(checkpoint.step) < config.num_train_steps: + for m in metrics.values(): 
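+          # Reset running metrics after each logging window; they are intentionally kept on the
+          # final step so the allreduced end-of-training summary below still reflects them.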
+ m.reset_states() + + #Print allreduced metrics on the last step + if (int(checkpoint.step) == config.num_train_steps and (local_step % args.gradient_accumulation_steps == 0)) or ((local_step + 1) % (config.save_checkpoints_steps * args.gradient_accumulation_steps) == 0): + log_info_dict = {k:float(hvd.allreduce(v.result()).numpy() * 100) if "accuracy" in k else float(hvd.allreduce(v.result()).numpy()) for k, v in metrics.items()} + log_info_dict["training_sequences_per_second"] = log_info_dict["train_perf"] + log_info_dict["final_loss"] = log_info_dict["total_loss"] + log_info_dict["e2e_train_time"] = time.time() - e2e_start_time + dllogger.log(step=(), data=log_info_dict, verbosity=0) + log(' Step:{step:6d}, Loss:{total_loss:10.6f}, Gen_loss:{masked_lm_loss:10.6f}, Disc_loss:{disc_loss:10.6f}, Gen_acc:{masked_lm_accuracy:6.2f}, ' + 'Disc_acc:{disc_accuracy:6.2f}, Perf:{train_perf:4.0f},'.format( + step=step, **log_info_dict), + all_rank=False) + + if local_step % args.gradient_accumulation_steps == 0: + checkpoint.step.assign(int(optimizer.iterations)) + + if not config.skip_checkpoint and (local_step % (config.save_checkpoints_steps * args.gradient_accumulation_steps) == 0): + saved_ckpt = True + if is_main_process(): + save_path = manager.save(checkpoint_number=step) + log(" ** Saved model checkpoint for step {}: {}".format(step, save_path)) + iter_save_path = iter_manager.save(checkpoint_number=step) + log(" ** Saved iterator checkpoint for step {}: {}".format(step, iter_save_path), all_rank=True) + local_step += 1 + if config.steps_this_run != -1 and (local_step % (config.steps_this_run * args.gradient_accumulation_steps) == 0): + #terminating run sooner as steps_this_run has been reached + log("terminating as steps_this_run:{} has been reached".format(config.steps_this_run)) + break + + step = (int(checkpoint.step) - 1) + dllogger.flush() + if not config.skip_checkpoint and not saved_ckpt: + if is_main_process(): + save_path = manager.save(checkpoint_number=step) + log(" ** Saved model checkpoint for step {}: {}".format(step, save_path)) + iter_save_path = iter_manager.save(checkpoint_number=step) + log(" ** Saved iterator checkpoint for step {}: {}".format(step, iter_save_path), all_rank=True) + + return args + + +if __name__ == "__main__": + start_time = time.time() + args = main(start_time) + log("Total Time:{:.4f}".format(time.time() - start_time)) + if is_main_process(): + postprocess_dllog(args) diff --git a/modelzoo/ELECTRA/run_tf_squad.py b/modelzoo/ELECTRA/run_tf_squad.py new file mode 100644 index 00000000..b76d6e12 --- /dev/null +++ b/modelzoo/ELECTRA/run_tf_squad.py @@ -0,0 +1,675 @@ +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
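In the pretraining loop above, an optimizer update is applied only every gradient_accumulation_steps micro-batches: train_one_step keeps adding gradients into the GradientAccumulator, take_step gates apply_gradients, and checkpoint.step is then advanced from optimizer.iterations, which counts applied updates rather than micro-batches. A minimal, self-contained sketch of that accounting (the plain-variable accumulator below is a stand-in for illustration, not the optimization.GradientAccumulator API):

    import tensorflow as tf

    ACCUM_STEPS = 4                      # stands in for config.gradient_accumulation_steps
    opt = tf.keras.optimizers.SGD(0.1)
    w = tf.Variable([1.0, 2.0])
    accum = [tf.Variable(tf.zeros_like(w), trainable=False)]   # one slot per trainable variable

    def micro_step(target):
        with tf.GradientTape() as tape:
            loss = tf.reduce_sum((w - target) ** 2)
        for slot, grad in zip(accum, tape.gradient(loss, [w])):
            slot.assign_add(grad)        # accumulate instead of applying immediately

    for local_step in range(1, 9):
        micro_step(tf.constant([0.0, 0.0]))
        if local_step % ACCUM_STEPS == 0:                       # the take_step condition above
            opt.apply_gradients(zip([s / ACCUM_STEPS for s in accum], [w]))
            for slot in accum:
                slot.assign(tf.zeros_like(slot))                # accumulator.reset()

    # After 8 micro-batches only 2 updates were applied, so optimizer.iterations == 2;
    # this is why the loop assigns checkpoint.step from optimizer.iterations.
    print(int(opt.iterations))

In the real loop the accumulated gradients are additionally averaged across workers with hvd.allreduce and clipped with tf.clip_by_global_norm before apply_gradients.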
+ +import os +import sys +import subprocess +import time +import argparse +import json +import logging + +import tensorflow as tf + +import horovod.tensorflow as hvd +from horovod.tensorflow.compression import Compression +from gpu_affinity import set_affinity + +if sys.version_info[0] == 2: + import cPickle as pickle +else: + import pickle + +from tqdm import tqdm +import dllogger +from utils import is_main_process, format_step, get_rank, get_world_size, log +from configuration import ElectraConfig +from modeling import TFElectraForQuestionAnswering +from tokenization import ElectraTokenizer +from optimization import create_optimizer +from squad_utils import SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features, \ + SquadResult, RawResult, get_answers + + +TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "google/electra-small-generator", + "google/electra-base-generator", + "google/electra-large-generator", + "google/electra-small-discriminator", + "google/electra-base-discriminator", + "google/electra-large-discriminator", + # See all ELECTRA models at https://huggingface.co/models?filter=electra +] + +def parse_args(): + parser = argparse.ArgumentParser() + + # Required parameters + parser.add_argument("--electra_model", default=None, type=str, required=True, + help="Model selected in the list: " + ", ".join(TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST)) + parser.add_argument("--data_dir", default=None, type=str, required=True, + help="Path to dataset.") + parser.add_argument("--output_dir", default=".", type=str, required=True, + help="The output directory where the model checkpoints and predictions will be written.") + parser.add_argument("--init_checkpoint", + default=None, + type=str, + help="The checkpoint file from pretraining") + + # Other parameters + parser.add_argument("--do_train", action='store_true', help="Whether to run training.") + parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.") + parser.add_argument("--do_eval", + action='store_true', + help="Whether to use evaluate accuracy of predictions") + parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") + parser.add_argument("--predict_file", default=None, type=str, + help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") + + parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") + parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") + parser.add_argument("--learning_rate", default=1e-4, type=float, help="The initial learning rate for Adam.") + parser.add_argument("--weight_decay_rate", default=0.01, type=float, help="Weight decay if we apply some.") + parser.add_argument("--layerwise_lr_decay", default=0.8, type=float, + help="The layerwise learning rate decay. Shallower layers have lower learning rates.") + + parser.add_argument("--num_train_epochs", default=3, type=int, + help="Total number of training epochs to perform.") + parser.add_argument("--max_steps", default=-1.0, type=float, + help="Total number of training steps to perform.") + parser.add_argument("--warmup_proportion", default=0.1, type=float, + help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% " + "of training.") + + parser.add_argument("--max_seq_length", default=384, type=int, + help="The maximum total input sequence length after WordPiece tokenization. 
Sequences " + "longer than this will be truncated, and sequences shorter than this will be padded.") + parser.add_argument("--doc_stride", default=128, type=int, + help="When splitting up a long document into chunks, how much stride to take between chunks.") + parser.add_argument("--max_query_length", default=64, type=int, + help="The maximum number of tokens for the question. Questions longer than this will " + "be truncated to this length.") + parser.add_argument("--vocab_file", default=None, type=str, + help="Path to vocabulary file use for tokenization") + parser.add_argument("--ci", action="store_true", help="true if running on CI") + parser.add_argument( + "--joint_head", + default=True, + type=bool, + help="Jointly predict the start and end positions", + ) + parser.add_argument( + "--beam_size", + default=4, + type=int, + help="Beam size when doing joint predictions", + ) + parser.add_argument("--n_best_size", default=20, type=int, + help="The total number of n-best predictions to generate in the nbest_predictions.json " + "output file.") + parser.add_argument("--max_answer_length", default=30, type=int, + help="The maximum length of an answer that can be generated. This is needed because the start " + "and end predictions are not conditioned on one another.") + + parser.add_argument("--verbose_logging", action='store_true', + help="If true, all of the warnings related to data processing will be printed. " + "A number of warnings are expected for a normal SQuAD evaluation.") + parser.add_argument("--no_cuda", + action='store_true', + help="Whether not to use CUDA when available") + parser.add_argument('--seed', + type=int, + default=42, + help="random seed for initialization") + parser.add_argument( + "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step." + ) + parser.add_argument('--gradient_accumulation_steps', + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.") + parser.add_argument("--do_lower_case", + action='store_true', + help="Whether to lower case the input text. 
True for uncased models, False for cased models.") + parser.add_argument("--local_rank", + type=int, + default=os.getenv('LOCAL_RANK', -1), + help="local_rank for distributed training on gpus") + parser.add_argument('--amp', + action='store_true', + help="Automatic mixed precision training") + parser.add_argument('--fp16_all_reduce', + action='store_true', + help="Whether to use 16-bit all reduce") + parser.add_argument('--xla', + action='store_true', + help="Whether to use XLA") + parser.add_argument('--version_2_with_negative', + action='store_true', + help='If true, the SQuAD examples contain some that do not have an answer.') + parser.add_argument('--null_score_diff_threshold', + type=float, default=0.0, + help="If null_score - best_non_null is greater than the threshold predict null.") + parser.add_argument('--log_freq', + type=int, default=50, + help='frequency of logging loss.') + parser.add_argument('--json-summary', type=str, default="results/dllogger.json", + help='If provided, the json summary will be written to the specified file.') + parser.add_argument("--eval_script", + help="Script to evaluate squad predictions", + default="evaluate.py", + type=str) + parser.add_argument("--use_env", + action='store_true', + help="Whether to read local rank from ENVVAR") + parser.add_argument('--skip_checkpoint', + default=False, + action='store_true', + help="Whether to save checkpoints") + parser.add_argument('--disable-progress-bar', + default=False, + action='store_true', + help='Disable tqdm progress bar') + parser.add_argument("--skip_cache", + default=False, + action='store_true', + help="Whether to cache train features") + parser.add_argument("--cache_dir", + default=None, + type=str, + help="Location to cache train feaures. Will default to the dataset direct") + args = parser.parse_args() + + if not args.do_train and (not args.init_checkpoint or args.init_checkpoint == 'None'): + raise ValueError("Checkpoint is required if do_train is not set") + + return args + + +def get_dataset_from_features(features, batch_size, drop_remainder=True, ngpu=8, mode="train", v2=False): + """Input function for training""" + + all_input_ids = tf.convert_to_tensor([f.input_ids for f in features], dtype=tf.int64) + all_input_mask = tf.convert_to_tensor([f.attention_mask for f in features], dtype=tf.int64) + all_segment_ids = tf.convert_to_tensor([f.token_type_ids for f in features], dtype=tf.int64) + all_start_pos = tf.convert_to_tensor([f.start_position for f in features], dtype=tf.int64) + all_end_pos = tf.convert_to_tensor([f.end_position for f in features], dtype=tf.int64) + + # if v2 else None: + all_cls_index = tf.convert_to_tensor([f.cls_index for f in features], dtype=tf.int64) + all_p_mask = tf.convert_to_tensor([f.p_mask for f in features], dtype=tf.float32) + all_is_impossible = tf.convert_to_tensor([f.is_impossible for f in features], dtype=tf.float32) + + dataset = tf.data.Dataset.from_tensor_slices( + (all_input_ids, all_input_mask, all_segment_ids, all_start_pos, all_end_pos) + + (all_cls_index, all_p_mask, all_is_impossible)) + if ngpu > 1: + dataset = dataset.shard(get_world_size(), get_rank()) + + if mode == "train": + dataset = dataset.shuffle(batch_size * 3) + # dataset = dataset.map(self._preproc_samples, + # num_parallel_calls=multiprocessing.cpu_count()//self._num_gpus) + dataset = dataset.batch(batch_size, drop_remainder=drop_remainder) + dataset = dataset.prefetch(batch_size) + + return dataset + + +@tf.function +def train_step(model, inputs, loss, amp, opt, init, v2=False, 
loss_class=None, fp16=False, clip_norm=1.0): + with tf.GradientTape() as tape: + [input_ids, input_mask, segment_ids, start_positions, end_positions, cls_index, p_mask, is_impossible] = inputs + + if not v2: + is_impossible = None + + start_logits, end_logits, cls_logits = model(input_ids, + attention_mask=input_mask, + token_type_ids=segment_ids, + start_positions=start_positions, + end_positions=end_positions, + cls_index=cls_index, + p_mask=p_mask, + is_impossible=is_impossible, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=True, + )[0:3] + + # If we are on multi-GPU, split add a dimension + if len(start_positions.shape) > 1: + start_positions = tf.squeeze(start_positions, axis=-1, name="squeeze_start_positions") + if len(end_positions.shape) > 1: + end_positions = tf.squeeze(end_positions, axis=-1, name="squeeze_end_positions") + if is_impossible is not None and len(is_impossible.shape) > 1 and v2 and cls_logits is not None: + is_impossible = tf.squeeze(is_impossible, axis=-1, name="squeeze_is_impossible") + + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.shape[1] + start_positions = tf.clip_by_value(start_positions, 0, ignored_index, name="clip_start_positions") + end_positions = tf.clip_by_value(end_positions, 0, ignored_index, name="clip_end_positions") + + start_loss = loss(y_true=start_positions, y_pred=tf.cast(start_logits, tf.float32)) + end_loss = loss(y_true=end_positions, y_pred=tf.cast(end_logits, tf.float32)) + loss_value = (start_loss + end_loss) / 2 + + if v2: + cls_loss_value = loss_class(y_true=is_impossible, y_pred=tf.cast(cls_logits, tf.float32)) + loss_value += cls_loss_value * 0.5 + + unscaled_loss = tf.stop_gradient(loss_value) + if amp: + loss_value = opt.get_scaled_loss(loss_value) + + tape = hvd.DistributedGradientTape(tape, sparse_as_dense=True, + compression=Compression.fp16 if fp16 else Compression.none) + gradients = tape.gradient(loss_value, model.trainable_variables) + if amp: + gradients = opt.get_unscaled_gradients(gradients) + (gradients, _) = tf.clip_by_global_norm(gradients, clip_norm=clip_norm) + opt.apply_gradients(zip(gradients, model.trainable_variables)) # , clip_norm=1.0) + + if init: + hvd.broadcast_variables(model.variables, root_rank=0) + hvd.broadcast_variables(opt.variables(), root_rank=0) + + return unscaled_loss # , outputs#, tape.gradient(loss_value, model.trainable_variables) + + +@tf.function +def infer_step(model, input_ids, + attention_mask=None, + token_type_ids=None, + cls_index=None, + p_mask=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): + return model(input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + cls_index=cls_index, + p_mask=p_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + training=training, + ) + + +def main(): + args = parse_args() + + hvd.init() + set_affinity(hvd.local_rank()) + + if is_main_process(): + log("Running total processes: {}".format(get_world_size())) + log("Starting process: {}".format(get_rank())) + + if is_main_process(): + dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE, + filename=args.json_summary), + dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step)]) + else: + dllogger.init(backends=[]) + + dllogger.metadata("exact_match", {"unit": None}) + dllogger.metadata("F1", {"unit": None}) + 
dllogger.metadata("inference_sequences_per_second", {"unit": "sequences/s"}) + dllogger.metadata("training_sequences_per_second", {"unit": "sequences/s"}) + + tf.random.set_seed(args.seed) + dllogger.log(step="PARAMETER", data={"SEED": args.seed}) + # script parameters + BATCH_SIZE = args.train_batch_size + EVAL_BATCH_SIZE = args.predict_batch_size + USE_XLA = args.xla + USE_AMP = args.amp + EPOCHS = args.num_train_epochs + + if not args.do_train: + EPOCHS = args.num_train_epochs = 1 + log("Since running inference only, setting args.num_train_epochs to 1") + + if not os.path.exists(args.output_dir) and is_main_process(): + os.makedirs(args.output_dir) + + # TensorFlow configuration + gpus = tf.config.experimental.list_physical_devices('GPU') + if gpus: + for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU') + tf.config.optimizer.set_jit(USE_XLA) + #tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP}) + + if args.amp: + policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16", loss_scale="dynamic") + tf.keras.mixed_precision.experimental.set_policy(policy) + print('Compute dtype: %s' % policy.compute_dtype) # Compute dtype: float16 + print('Variable dtype: %s' % policy.variable_dtype) # Variable dtype: float32 + + if is_main_process(): + log("***** Loading tokenizer and model *****") + # Load tokenizer and model from pretrained model/vocabulary. Specify the number of labels to classify (2+: classification, 1: regression) + electra_model = args.electra_model + config = ElectraConfig.from_pretrained(electra_model, cache_dir=args.cache_dir) + config.update({"amp": args.amp}) + if args.vocab_file is None: + tokenizer = ElectraTokenizer.from_pretrained(electra_model, cache_dir=args.cache_dir) + else: + tokenizer = ElectraTokenizer( + vocab_file=args.vocab_file, + do_lower_case=args.do_lower_case) + + model = TFElectraForQuestionAnswering.from_pretrained(electra_model, config=config, cache_dir=args.cache_dir, args=args) + + if is_main_process(): + log("***** Loading dataset *****") + # Load data + processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor() + train_examples = processor.get_train_examples(args.data_dir) if args.do_train else None + dev_examples = processor.get_dev_examples(args.data_dir) if args.do_predict else None + + if is_main_process(): + log("***** Loading features *****") + # Load cached features + squad_version = '2.0' if args.version_2_with_negative else '1.1' + if args.cache_dir is None: + args.cache_dir = args.data_dir + cached_train_features_file = args.cache_dir.rstrip('/') + '/' + 'TF2_train-v{4}.json_{1}_{2}_{3}'.format( + electra_model.split("/")[1], str(args.max_seq_length), str(args.doc_stride), + str(args.max_query_length), squad_version) + cached_dev_features_file = args.cache_dir.rstrip('/') + '/' + 'TF2_dev-v{4}.json_{1}_{2}_{3}'.format( + electra_model.split("/")[1], str(args.max_seq_length), str(args.doc_stride), + str(args.max_query_length), squad_version) + + try: + with open(cached_train_features_file, "rb") as reader: + train_features = pickle.load(reader) if args.do_train else [] + with open(cached_dev_features_file, "rb") as reader: + dev_features = pickle.load(reader) if args.do_predict else [] + except: + train_features = ( # TODO: (yy) do on rank 0? 
+ squad_convert_examples_to_features( + examples=train_examples, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + doc_stride=args.doc_stride, + max_query_length=args.max_query_length, + is_training=True, + return_dataset="", + ) + if args.do_train + else [] + ) + dev_features = ( + squad_convert_examples_to_features( + examples=dev_examples, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + doc_stride=args.doc_stride, + max_query_length=args.max_query_length, + is_training=False, + return_dataset="", + ) + if args.do_predict + else [] + ) + # Dump Cached features + if not args.skip_cache and is_main_process(): + if args.do_train: + log("***** Building Cache Files: {} *****".format(cached_train_features_file)) + with open(cached_train_features_file, "wb") as writer: + pickle.dump(train_features, writer) + if args.do_predict: + log("***** Building Cache Files: {} *****".format(cached_dev_features_file)) + with open(cached_dev_features_file, "wb") as writer: + pickle.dump(dev_features, writer) + + len_train_features = len(train_features) + total_train_steps = int((len_train_features * EPOCHS / BATCH_SIZE) / get_world_size()) + 1 + train_steps_per_epoch = int((len_train_features / BATCH_SIZE) / get_world_size()) + 1 + len_dev_features = len(dev_features) + total_dev_steps = int((len_dev_features / EVAL_BATCH_SIZE)) + 1 + + train_dataset = get_dataset_from_features(train_features, BATCH_SIZE, + v2=args.version_2_with_negative) if args.do_train else [] + dev_dataset = get_dataset_from_features(dev_features, EVAL_BATCH_SIZE, drop_remainder=False, ngpu=1, mode="dev", + v2=args.version_2_with_negative) if args.do_predict else [] + + opt = create_optimizer(init_lr=args.learning_rate, num_train_steps=total_train_steps, + num_warmup_steps=int(args.warmup_proportion * total_train_steps), + weight_decay_rate=args.weight_decay_rate, + layerwise_lr_decay=args.layerwise_lr_decay, + n_transformer_layers=model.num_hidden_layers) + if USE_AMP: + # loss scaling is currently required when using mixed precision + opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, "dynamic") + + # Define loss function + loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) + loss_class = tf.keras.losses.BinaryCrossentropy( + from_logits=True, + name='binary_crossentropy' + ) + metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy") + model.compile(optimizer=opt, loss=loss, metrics=[metric]) + train_loss_results = [] + + if args.do_train and is_main_process(): + log("***** Running training *****") + log(" Num examples = ", len_train_features) + log(" Num Epochs = ", args.num_train_epochs) + log(" Instantaneous batch size per GPU = ", args.train_batch_size) + log( + " Total train batch size (w. 
parallel, distributed & accumulation) = ", + args.train_batch_size + * get_world_size(), + ) + log(" Total optimization steps =", total_train_steps) + + total_train_time = 0 + latency = [] + for epoch in range(EPOCHS): + if args.do_train: + epoch_loss_avg = tf.keras.metrics.Mean() + epoch_perf_avg = tf.keras.metrics.Mean() + epoch_start = time.time() + + epoch_iterator = tqdm(train_dataset, total=train_steps_per_epoch, desc="Iteration", mininterval=5, + disable=not is_main_process()) + for iter, inputs in enumerate(epoch_iterator): + # breaking criterion if max_steps if > 1 + if args.max_steps > 0 and (epoch * train_steps_per_epoch + iter) > args.max_steps: + break + iter_start = time.time() + # Optimize the model + loss_value = train_step(model, inputs, loss, USE_AMP, opt, (iter == 0 and epoch == 0), + v2=args.version_2_with_negative, loss_class=loss_class, fp16=USE_AMP) + #introduce CPU-GPU sync for training perf computation + loss_numpy = loss_value.numpy() + + epoch_perf_avg.update_state(1. * BATCH_SIZE / (time.time() - iter_start)) + if iter % args.log_freq == 0: + if is_main_process(): + log("\nEpoch: {:03d}, Step:{:6d}, Loss:{:12.8f}, Perf:{:5.0f}, loss_scale:{}, opt_step:{}".format(epoch, iter, loss_value, + epoch_perf_avg.result() * get_world_size(), opt.loss_scale if config.amp else 1, + int(opt.iterations))) + dllogger.log(step=(epoch, iter,), data={"step_loss": float(loss_value.numpy()), + "train_perf": float( epoch_perf_avg.result().numpy() * get_world_size())}) + + # Track progress + epoch_loss_avg.update_state(loss_value) # Add current batch loss + + # End epoch + train_loss_results.append(epoch_loss_avg.result()) + total_train_time += float(time.time() - epoch_start) + # Summarize and save checkpoint at the end of each epoch + if is_main_process(): + + dllogger.log(step=tuple(), data={"e2e_train_time": total_train_time, + "training_sequences_per_second": float( + epoch_perf_avg.result().numpy() * get_world_size()), + "final_loss": float(epoch_loss_avg.result().numpy())}) + + if not args.skip_checkpoint: + if args.ci: + checkpoint_name = "{}/electra_base_qa_v2_{}_epoch_{}_ckpt".format(args.output_dir, args.version_2_with_negative, epoch + 1) + else: + checkpoint_name = "checkpoints/electra_base_qa_v2_{}_epoch_{}_ckpt".format(args.version_2_with_negative, epoch + 1) + if is_main_process(): + model.save_weights(checkpoint_name) + + + if args.do_predict and (args.evaluate_during_training or epoch == args.num_train_epochs - 1): + if not args.do_train: + log("***** Loading checkpoint: {} *****".format(args.init_checkpoint)) + model.load_weights(args.init_checkpoint).expect_partial() + + current_feature_id = 0 + all_results = [] + if is_main_process(): + log("***** Running evaluation *****") + log(" Num Batches = ", total_dev_steps) + log(" Batch size = ", args.predict_batch_size) + + raw_infer_start = time.time() + if is_main_process(): + infer_perf_avg = tf.keras.metrics.Mean() + dev_iterator = tqdm(dev_dataset, total=total_dev_steps, desc="Iteration", mininterval=5, + disable=not is_main_process()) + for input_ids, input_mask, segment_ids, start_positions, end_positions, cls_index, p_mask, is_impossible in dev_iterator: + # training=False is needed only if there are layers with different + # behavior during training versus inference (e.g. Dropout). 
+ + iter_start = time.time() + + if not args.joint_head: + batch_start_logits, batch_end_logits = infer_step(model, input_ids, + attention_mask=input_mask, + token_type_ids=segment_ids, + )[:2] + #Synchronize with GPU to compute time + _ = batch_start_logits.numpy() + + else: + + outputs = infer_step(model, input_ids, + attention_mask=input_mask, + token_type_ids=segment_ids, + cls_index=cls_index, + p_mask=p_mask, + ) + #Synchronize with GPU to compute time + _ = outputs[0].numpy() + + infer_time = (time.time() - iter_start) + infer_perf_avg.update_state(1. * EVAL_BATCH_SIZE / infer_time) + latency.append(infer_time) + + for iter_ in range(input_ids.shape[0]): + + if not args.joint_head: + start_logits = batch_start_logits[iter_].numpy().tolist() + end_logits = batch_end_logits[iter_].numpy().tolist() + dev_feature = dev_features[current_feature_id] + current_feature_id += 1 + unique_id = int(dev_feature.unique_id) + all_results.append(RawResult(unique_id=unique_id, + start_logits=start_logits, + end_logits=end_logits)) + else: + dev_feature = dev_features[current_feature_id] + current_feature_id += 1 + unique_id = int(dev_feature.unique_id) + output = [output[iter_].numpy().tolist() for output in outputs] + + start_logits = output[0] + start_top_index = output[1] + end_logits = output[2] + end_top_index = output[3] + cls_logits = output[4] + result = SquadResult( + unique_id, + start_logits, + end_logits, + start_top_index=start_top_index, + end_top_index=end_top_index, + cls_logits=cls_logits, + ) + + all_results.append(result) + + # Compute and save predictions + answers, nbest_answers = get_answers(dev_examples, dev_features, all_results, args) + + output_prediction_file = os.path.join(args.output_dir, "predictions.json") + output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") + e2e_infer_time = time.time() - raw_infer_start + # if args.version_2_with_negative: + # output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json") + # else: + # output_null_log_odds_file = None + with open(output_prediction_file, "w") as f: + f.write(json.dumps(answers, indent=4) + "\n") + with open(output_nbest_file, "w") as f: + f.write(json.dumps(nbest_answers, indent=4) + "\n") + + if args.do_eval: + if args.version_2_with_negative: + dev_file = "dev-v2.0.json" + else: + dev_file = "dev-v1.1.json" + + eval_out = subprocess.check_output([sys.executable, args.eval_script, + args.data_dir + "/" + dev_file, output_prediction_file]) + log(eval_out.decode('UTF-8')) + scores = str(eval_out).strip() + exact_match = float(scores.split(":")[1].split(",")[0]) + if args.version_2_with_negative: + f1 = float(scores.split(":")[2].split(",")[0]) + else: + f1 = float(scores.split(":")[2].split("}")[0]) + + log("Epoch: {:03d} Results: {}".format(epoch, eval_out.decode('UTF-8'))) + log("**EVAL SUMMARY** - Epoch: {:03d}, EM: {:6.3f}, F1: {:6.3f}, Infer_Perf: {:4.0f} seq/s" + .format(epoch, exact_match, f1, infer_perf_avg.result())) + + latency_all = sorted(latency)[:-2] + log( + "**LATENCY SUMMARY** - Epoch: {:03d}, Ave: {:6.3f} ms, 90%: {:6.3f} ms, 95%: {:6.3f} ms, 99%: {:6.3f} ms" + .format(epoch, sum(latency_all) / len(latency_all) * 1000, + sum(latency_all[:int(len(latency_all) * 0.9)]) / int(len(latency_all) * 0.9) * 1000, + sum(latency_all[:int(len(latency_all) * 0.95)]) / int(len(latency_all) * 0.95) * 1000, + sum(latency_all[:int(len(latency_all) * 0.99)]) / int(len(latency_all) * 0.99) * 1000, + )) + dllogger.log(step=tuple(), + data={"inference_sequences_per_second": 
float(infer_perf_avg.result().numpy()), + "e2e_inference_time": e2e_infer_time}) + + if is_main_process() and args.do_train and args.do_eval: + log( + "**RESULTS SUMMARY** - EM: {:6.3f}, F1: {:6.3f}, Train_Time: {:4.0f} s, Train_Perf: {:4.0f} seq/s, Infer_Perf: {:4.0f} seq/s" + .format(exact_match, f1, total_train_time, epoch_perf_avg.result() * get_world_size(), + infer_perf_avg.result())) + dllogger.log(step=tuple(), data={"exact_match": exact_match, "F1": f1}) + + +if __name__ == "__main__": + main() diff --git a/modelzoo/ELECTRA/scripts/benchmark_pretraining.sh b/modelzoo/ELECTRA/scripts/benchmark_pretraining.sh new file mode 100644 index 00000000..cbeac9ed --- /dev/null +++ b/modelzoo/ELECTRA/scripts/benchmark_pretraining.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash + +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +train_batch_size_p1=${1:-"176"} +learning_rate_p1="6e-7" +precision=${2:-"amp"} +xla=${3:-"xla"} +num_gpus=${4:-8} +warmup_steps_p1="10" +train_steps_p1=10 +save_checkpoint_steps=500 +resume_training="false" +optimizer="lamb" +accumulate_gradients=${5:-"true"} +gradient_accumulation_steps_p1=${6:-48} +seed=42 +job_name="electra_lamb_pretraining_benchmark" +train_batch_size_p2=${7:-24} +learning_rate_p2="4e-7" +warmup_steps_p2="10" +train_steps_p2=10 +gradient_accumulation_steps_p2=${8:-144} +electra_model=${9:-"base"} + +restore_checkpoint=false bash scripts/run_pretraining.sh $train_batch_size_p1 $learning_rate_p1 $precision $num_gpus $xla \ + $warmup_steps_p1 $train_steps_p1 $save_checkpoint_steps \ + $resume_training $optimizer $accumulate_gradients \ + $gradient_accumulation_steps_p1 $seed $job_name \ + $train_batch_size_p2 $learning_rate_p2 \ + $warmup_steps_p2 $train_steps_p2 $gradient_accumulation_steps_p2 \ + $electra_model diff --git a/modelzoo/ELECTRA/scripts/benchmark_squad.sh b/modelzoo/ELECTRA/scripts/benchmark_squad.sh new file mode 100644 index 00000000..39263d65 --- /dev/null +++ b/modelzoo/ELECTRA/scripts/benchmark_squad.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
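The 90/95/99% figures in the LATENCY SUMMARY line above are not point percentiles: the SQuAD script sorts the per-batch inference times, drops the two slowest, and then averages the fastest 90/95/99% of what remains. A standalone sketch of that arithmetic (the sample timings are made up):

    # Same arithmetic as the latency summary in run_tf_squad.py; the sample data is illustrative.
    latency = [0.031, 0.029, 0.030, 0.055, 0.028, 0.032, 0.030, 0.090, 0.029, 0.031]

    latency_all = sorted(latency)[:-2]              # drop the two slowest batches

    def avg_ms(values):
        return sum(values) / len(values) * 1000.0

    print("Ave: {:.3f} ms".format(avg_ms(latency_all)))
    for frac in (0.90, 0.95, 0.99):
        n = int(len(latency_all) * frac)            # fastest `frac` of the remaining batches
        print("{:.0%}: {:.3f} ms".format(frac, avg_ms(latency_all[:n])))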
+ + +mode=${1:-"train"} +num_gpu=${2:-"8"} +batch_size=${3:-"16"} +infer_batch_size=${4:-"$batch_size"} +precision=${5:-"amp"} +SQUAD_VERSION=${6:-"1.1"} +squad_dir=${7:-"/workspace/electra/data/download/squad/v$SQUAD_VERSION"} +OUT_DIR=${8:-"results/"} +init_checkpoint=${9:-"None"} +cache_dir=${10:-"$squad_dir"} + +bash scripts/run_squad.sh google/electra-base-discriminator 1 $batch_size $infer_batch_size 8e-4 $precision $num_gpu $RANDOM $SQUAD_VERSION $squad_dir $OUT_DIR $init_checkpoint $mode interactive $cache_dir 200 diff --git a/modelzoo/ELECTRA/scripts/bind.sh b/modelzoo/ELECTRA/scripts/bind.sh new file mode 100755 index 00000000..0d1a69b5 --- /dev/null +++ b/modelzoo/ELECTRA/scripts/bind.sh @@ -0,0 +1,226 @@ +#! /bin/bash + +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -euo pipefail + +print_usage() { + cat << EOF +${0} [options] [--] COMMAND [ARG...] + +Control binding policy for each task. Assumes one rank will be launched for each GPU. + +Options: + --cpu=MODE + * exclusive -- bind each rank to an exclusive set of cores near its GPU + * exclusive,nosmt -- bind each rank to an exclusive set of cores near its GPU, without hyperthreading + * node -- bind each rank to all cores in the NUMA node nearest its GPU [default] + * *.sh -- bind each rank using the bash associative array bind_cpu_cores or bind_cpu_nodes from a file + * off -- don't bind + --mem=MODE + * node -- bind each rank to the nearest NUMA node [default] + * *.sh -- bind each rank using the bash associative array bind_mem from a file + * off -- don't bind + --ib=MODE + * single -- bind each rank to a single IB device near its GPU + * off -- don't bind [default] + --cluster=CLUSTER + Select which cluster is being used. May be required if system params cannot be detected. 
+EOF +} + +################################################################################ +# Argument parsing +################################################################################ + +cpu_mode='node' +mem_mode='node' +ib_mode='off' +cluster='' +while [ $# -gt 0 ]; do + case "$1" in + -h|--help) print_usage ; exit 0 ;; + --cpu=*) cpu_mode="${1/*=/}"; shift ;; + --cpu) cpu_mode="$2"; shift 2 ;; + --mem=*) mem_mode="${1/*=/}"; shift ;; + --mem) mem_mode="$2"; shift 2 ;; + --ib=*) ib_mode="${1/*=/}"; shift ;; + --ib) ib_mode="$2"; shift 2 ;; + --cluster=*) cluster="${1/*=/}"; shift ;; + --cluster) cluster="$2"; shift 2 ;; + --) shift; break ;; + *) break ;; + esac +done +if [ $# -lt 1 ]; then + echo 'ERROR: no command given' 2>&1 + print_usage + exit 1 +fi + +################################################################################ +# Get system params +################################################################################ + +# LOCAL_RANK is set with an enroot hook for Pytorch containers +# SLURM_LOCALID is set by Slurm +# OMPI_COMM_WORLD_LOCAL_RANK is set by mpirun +readonly local_rank="${LOCAL_RANK:=${SLURM_LOCALID:=${OMPI_COMM_WORLD_LOCAL_RANK:-}}}" +if [ -z "${local_rank}" ]; then + echo 'ERROR: cannot read LOCAL_RANK from env' >&2 + exit 1 +fi + +num_gpus=$(nvidia-smi -i 0 --query-gpu=count --format=csv,noheader,nounits) +if [ "${local_rank}" -ge "${num_gpus}" ]; then + echo "ERROR: local rank is ${local_rank}, but there are only ${num_gpus} gpus available" >&2 + exit 1 +fi + +get_lscpu_value() { + awk -F: "(\$1 == \"${1}\"){gsub(/ /, \"\", \$2); print \$2; found=1} END{exit found!=1}" +} +lscpu_out=$(lscpu) +num_sockets=$(get_lscpu_value 'Socket(s)' <<< "${lscpu_out}") +num_nodes=$(get_lscpu_value 'NUMA node(s)' <<< "${lscpu_out}") +cores_per_socket=$(get_lscpu_value 'Core(s) per socket' <<< "${lscpu_out}") + +echo "num_sockets = ${num_sockets} num_nodes=${num_nodes} cores_per_socket=${cores_per_socket}" + +readonly cores_per_node=$(( (num_sockets * cores_per_socket) / num_nodes )) +if [ ${num_gpus} -gt 1 ]; then + readonly gpus_per_node=$(( num_gpus / num_nodes )) +else + readonly gpus_per_node=1 +fi +readonly cores_per_gpu=$(( cores_per_node / gpus_per_node )) +readonly local_node=$(( local_rank / gpus_per_node )) + + +declare -a ibdevs=() +case "${cluster}" in + circe) + # Need to specialize for circe because IB detection is hard + ibdevs=(mlx5_1 mlx5_2 mlx5_3 mlx5_4 mlx5_7 mlx5_8 mlx5_9 mlx5_10) + ;; + selene) + # Need to specialize for selene because IB detection is hard + ibdevs=(mlx5_0 mlx5_1 mlx5_2 mlx5_3 mlx5_6 mlx5_7 mlx5_8 mlx5_9) + ;; + '') + if ibstat_out="$(ibstat -l 2>/dev/null | sort -V)" ; then + mapfile -t ibdevs <<< "${ibstat_out}" + fi + ;; + *) + echo "ERROR: Unknown cluster '${cluster}'" >&2 + exit 1 + ;; +esac +readonly num_ibdevs="${#ibdevs[@]}" + +################################################################################ +# Setup for exec +################################################################################ + +declare -a numactl_args=() + +case "${cpu_mode}" in + exclusive) + numactl_args+=( "$(printf -- "--physcpubind=%u-%u,%u-%u" \ + $(( local_rank * cores_per_gpu )) \ + $(( (local_rank + 1) * cores_per_gpu - 1 )) \ + $(( local_rank * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) )) \ + $(( (local_rank + 1) * cores_per_gpu + (cores_per_gpu * gpus_per_node * num_nodes) - 1 )) \ + )" ) + ;; + exclusive,nosmt) + numactl_args+=( "$(printf -- "--physcpubind=%u-%u" \ + $(( local_rank * cores_per_gpu )) \ + 
$(( (local_rank + 1) * cores_per_gpu - 1 )) \ + )" ) + ;; + node) + numactl_args+=( "--cpunodebind=${local_node}" ) + ;; + *.sh) + source "${cpu_mode}" + if [ -n "${bind_cpu_cores:-}" ]; then + numactl_args+=( "--physcpubind=${bind_cpu_cores[${local_rank}]}" ) + elif [ -n "${bind_cpu_nodes:-}" ]; then + numactl_args+=( "--cpunodebind=${bind_cpu_nodes[${local_rank}]}" ) + else + echo "ERROR: invalid CPU affinity file ${cpu_mode}." >&2 + exit 1 + fi + ;; + off|'') + ;; + *) + echo "ERROR: invalid cpu mode '${cpu_mode}'" 2>&1 + print_usage + exit 1 + ;; +esac + +case "${mem_mode}" in + node) + numactl_args+=( "--membind=${local_node}" ) + ;; + *.sh) + source "${mem_mode}" + if [ -z "${bind_mem:-}" ]; then + echo "ERROR: invalid memory affinity file ${mem_mode}." >&2 + exit 1 + fi + numactl_args+=( "--membind=${bind_mem[${local_rank}]}" ) + ;; + off|'') + ;; + *) + echo "ERROR: invalid mem mode '${mem_mode}'" 2>&1 + print_usage + exit 1 + ;; +esac + +case "${ib_mode}" in + single) + if [ "${num_ibdevs}" -eq 0 ]; then + echo "WARNING: used '$0 --ib=single', but there are 0 IB devices available; skipping IB binding." 2>&1 + else + readonly ibdev="${ibdevs[$(( local_rank * num_ibdevs / num_gpus ))]}" + export OMPI_MCA_btl_openib_if_include="${OMPI_MCA_btl_openib_if_include-$ibdev}" + export UCX_NET_DEVICES="${UCX_NET_DEVICES-$ibdev:1}" + fi + ;; + off|'') + ;; + *) + echo "ERROR: invalid ib mode '${ib_mode}'" 2>&1 + print_usage + exit 1 + ;; +esac + +################################################################################ +# Exec +################################################################################ + +if [ "${#numactl_args[@]}" -gt 0 ] ; then + set -x + exec numactl "${numactl_args[@]}" -- "${@}" +else + exec "${@}" +fi diff --git a/modelzoo/ELECTRA/scripts/configs/pretrain_config.sh b/modelzoo/ELECTRA/scripts/configs/pretrain_config.sh new file mode 100644 index 00000000..7ddb3299 --- /dev/null +++ b/modelzoo/ELECTRA/scripts/configs/pretrain_config.sh @@ -0,0 +1,411 @@ +#!/usr/bin/env bash + +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
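For reference, bind.sh's --cpu=exclusive mode pins each rank to two CPU ranges: its own slice of physical cores plus the matching hyperthread siblings, which under a typical Linux enumeration sit cores_per_gpu * gpus_per_node * num_nodes logical-CPU IDs higher (that product equals the total physical core count). A small sketch of that index arithmetic for a hypothetical 2-socket, 64-physical-core, 8-GPU machine (the topology numbers are assumed for illustration, not detected):

    # Mirrors the --physcpubind computation in bind.sh; topology values are assumed, not probed.
    num_sockets, cores_per_socket, num_numa_nodes, num_gpus = 2, 32, 2, 8

    cores_per_node = (num_sockets * cores_per_socket) // num_numa_nodes    # 32
    gpus_per_node = num_gpus // num_numa_nodes                             # 4
    cores_per_gpu = cores_per_node // gpus_per_node                        # 8
    smt_offset = cores_per_gpu * gpus_per_node * num_numa_nodes            # 64 -> first SMT sibling

    for local_rank in range(num_gpus):
        first = local_rank * cores_per_gpu
        last = (local_rank + 1) * cores_per_gpu - 1
        print("rank %d -> --physcpubind=%d-%d,%d-%d"
              % (local_rank, first, last, first + smt_offset, last + smt_offset))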
+ +# Full pretraining configs for NVIDIA DGX-A100 (8x NVIDIA A100 40GB GPU) + +dgxa100_8gpu_amp () +{ + train_batch_size_p1="176" + learning_rate_p1="6e-3" + precision="amp" + xla="xla" + num_gpus=8 + warmup_steps_p1="2000" + train_steps_p1=10000 + save_checkpoint_steps=500 + resume_training="false" + optimizer="lamb" + accumulate_gradients="true" + gradient_accumulation_steps_p1=48 + seed=42 + job_name="electra_lamb_pretraining" + train_batch_size_p2=24 + learning_rate_p2="4e-3" + warmup_steps_p2="200" + train_steps_p2=933 + gradient_accumulation_steps_p2=144 + electra_model="base" + echo $train_batch_size_p1 $learning_rate_p1 $precision $num_gpus $xla \ + $warmup_steps_p1 $train_steps_p1 $save_checkpoint_steps \ + $resume_training $optimizer $accumulate_gradients \ + $gradient_accumulation_steps_p1 $seed $job_name \ + $train_batch_size_p2 $learning_rate_p2 \ + $warmup_steps_p2 $train_steps_p2 $gradient_accumulation_steps_p2 \ + $electra_model + +} + +dgxa100_8gpu_tf32 () +{ + train_batch_size_p1="88" + learning_rate_p1="6e-3" + precision="tf32" + xla="xla" + num_gpus=8 + warmup_steps_p1="2000" + train_steps_p1=10000 + save_checkpoint_steps=500 + resume_training="false" + optimizer="lamb" + accumulate_gradients="true" + gradient_accumulation_steps_p1=96 + seed=42 + job_name="electra_lamb_pretraining" + train_batch_size_p2=12 + learning_rate_p2="4e-3" + warmup_steps_p2="200" + train_steps_p2=933 + gradient_accumulation_steps_p2=288 + electra_model="base" + echo $train_batch_size_p1 $learning_rate_p1 $precision $num_gpus $xla \ + $warmup_steps_p1 $train_steps_p1 $save_checkpoint_steps \ + $resume_training $optimizer $accumulate_gradients \ + $gradient_accumulation_steps_p1 $seed $job_name \ + $train_batch_size_p2 $learning_rate_p2 \ + $warmup_steps_p2 $train_steps_p2 $gradient_accumulation_steps_p2 \ + $electra_model + +} + + +# Full pretraining configs for NVIDIA DGX-2H (16x NVIDIA V100 32GB GPU) + +dgx2_16gpu_amp () +{ + train_batch_size_p1="176" + learning_rate_p1="6e-3" + precision="amp" + xla="xla" + num_gpus=16 + warmup_steps_p1="2000" + train_steps_p1=10000 + save_checkpoint_steps=500 + resume_training="false" + optimizer="lamb" + accumulate_gradients="true" + gradient_accumulation_steps_p1=24 + seed=42 + job_name="electra_lamb_pretraining" + train_batch_size_p2=24 + learning_rate_p2="4e-3" + warmup_steps_p2="200" + train_steps_p2=933 + gradient_accumulation_steps_p2=72 + electra_model="base" + echo $train_batch_size_p1 $learning_rate_p1 $precision $num_gpus $xla \ + $warmup_steps_p1 $train_steps_p1 $save_checkpoint_steps \ + $resume_training $optimizer $accumulate_gradients \ + $gradient_accumulation_steps_p1 $seed $job_name \ + $train_batch_size_p2 $learning_rate_p2 \ + $warmup_steps_p2 $train_steps_p2 $gradient_accumulation_steps_p2 \ + $electra_model + +} + +dgx2_16gpu_fp32 () +{ + train_batch_size_p1="88" + learning_rate_p1="6e-3" + precision="fp32" + xla="xla" + num_gpus=16 + warmup_steps_p1="2000" + train_steps_p1=10000 + save_checkpoint_steps=500 + resume_training="false" + optimizer="lamb" + accumulate_gradients="true" + gradient_accumulation_steps_p1=48 + seed=42 + job_name="electra_lamb_pretraining" + train_batch_size_p2=12 + learning_rate_p2="4e-3" + warmup_steps_p2="200" + train_steps_p2=933 + gradient_accumulation_steps_p2=144 + electra_model="base" + echo $train_batch_size_p1 $learning_rate_p1 $precision $num_gpus $xla \ + $warmup_steps_p1 $train_steps_p1 $save_checkpoint_steps \ + $resume_training $optimizer $accumulate_gradients \ + $gradient_accumulation_steps_p1 
$seed $job_name \
+ $train_batch_size_p2 $learning_rate_p2 \
+ $warmup_steps_p2 $train_steps_p2 $gradient_accumulation_steps_p2 \
+ $electra_model
+
+}
+
+# Full pretraining configs for NVIDIA DGX-1 (8x NVIDIA V100 16GB GPU)
+
+dgx1_8gpu_amp ()
+{
+ train_batch_size_p1="88"
+ learning_rate_p1="6e-3"
+ precision="amp"
+ xla="xla"
+ num_gpus=8
+ warmup_steps_p1="2000"
+ train_steps_p1=10000
+ save_checkpoint_steps=500
+ resume_training="false"
+ optimizer="lamb"
+ accumulate_gradients="true"
+ gradient_accumulation_steps_p1=96
+ seed=42
+ job_name="electra_lamb_pretraining"
+ train_batch_size_p2=12
+ learning_rate_p2="4e-3"
+ warmup_steps_p2="200"
+ train_steps_p2=933
+ gradient_accumulation_steps_p2=288
+ electra_model="base"
+ echo $train_batch_size_p1 $learning_rate_p1 $precision $num_gpus $xla \
+ $warmup_steps_p1 $train_steps_p1 $save_checkpoint_steps \
+ $resume_training $optimizer $accumulate_gradients \
+ $gradient_accumulation_steps_p1 $seed $job_name \
+ $train_batch_size_p2 $learning_rate_p2 \
+ $warmup_steps_p2 $train_steps_p2 $gradient_accumulation_steps_p2 \
+ $electra_model
+
+}
+
+dgx1_8gpu_fp32 ()
+{
+ train_batch_size_p1="40"
+ learning_rate_p1="6e-3"
+ precision="fp32"
+ xla="xla"
+ num_gpus=8
+ warmup_steps_p1="2000"
+ train_steps_p1=10000
+ save_checkpoint_steps=500
+ resume_training="false"
+ optimizer="lamb"
+ accumulate_gradients="true"
+ gradient_accumulation_steps_p1=211
+ seed=42
+ job_name="electra_lamb_pretraining"
+ train_batch_size_p2=6
+ learning_rate_p2="4e-3"
+ warmup_steps_p2="200"
+ train_steps_p2=933
+ gradient_accumulation_steps_p2=576
+ electra_model="base"
+ echo $train_batch_size_p1 $learning_rate_p1 $precision $num_gpus $xla \
+ $warmup_steps_p1 $train_steps_p1 $save_checkpoint_steps \
+ $resume_training $optimizer $accumulate_gradients \
+ $gradient_accumulation_steps_p1 $seed $job_name \
+ $train_batch_size_p2 $learning_rate_p2 \
+ $warmup_steps_p2 $train_steps_p2 $gradient_accumulation_steps_p2 \
+ $electra_model
+
+}
+
+# Full pretraining configs for NVIDIA DGX-A100 (1x NVIDIA A100 40GB GPU)
+
+dgxa100_1gpu_amp ()
+{
+ train_batch_size_p1="176"
+ learning_rate_p1="6e-3"
+ precision="amp"
+ xla="xla"
+ num_gpus=1
+ warmup_steps_p1="2000"
+ train_steps_p1=10000
+ save_checkpoint_steps=500
+ resume_training="false"
+ optimizer="lamb"
+ accumulate_gradients="true"
+ gradient_accumulation_steps_p1=384
+ seed=42
+ job_name="electra_lamb_pretraining"
+ train_batch_size_p2=24
+ learning_rate_p2="4e-3"
+ warmup_steps_p2="200"
+ train_steps_p2=933
+ gradient_accumulation_steps_p2=1152
+ electra_model="base"
+ echo $train_batch_size_p1 $learning_rate_p1 $precision $num_gpus $xla \
+ $warmup_steps_p1 $train_steps_p1 $save_checkpoint_steps \
+ $resume_training $optimizer $accumulate_gradients \
+ $gradient_accumulation_steps_p1 $seed $job_name \
+ $train_batch_size_p2 $learning_rate_p2 \
+ $warmup_steps_p2 $train_steps_p2 $gradient_accumulation_steps_p2 \
+ $electra_model
+
+}
+
+dgxa100_1gpu_tf32 ()
+{
+ train_batch_size_p1="88"
+ learning_rate_p1="6e-3"
+ precision="tf32"
+ xla="xla"
+ num_gpus=1
+ warmup_steps_p1="2000"
+ train_steps_p1=10000
+ save_checkpoint_steps=500
+ resume_training="false"
+ optimizer="lamb"
+ accumulate_gradients="true"
+ gradient_accumulation_steps_p1=768
+ seed=42
+ job_name="electra_lamb_pretraining"
+ train_batch_size_p2=12
+ learning_rate_p2="4e-3"
+ warmup_steps_p2="200"
+ train_steps_p2=933
+ gradient_accumulation_steps_p2=2304
+ electra_model="base"
+ echo $train_batch_size_p1 $learning_rate_p1 $precision $num_gpus $xla \
+ 
$warmup_steps_p1 $train_steps_p1 $save_checkpoint_steps \ + $resume_training $optimizer $accumulate_gradients \ + $gradient_accumulation_steps_p1 $seed $job_name \ + $train_batch_size_p2 $learning_rate_p2 \ + $warmup_steps_p2 $train_steps_p2 $gradient_accumulation_steps_p2 \ + $electra_model + +} + +# Full pretraining configs for NVIDIA DGX-2H (1x NVIDIA V100 32GB GPU) + +dgx2_1gpu_amp () +{ + train_batch_size_p1="176" + learning_rate_p1="6e-3" + precision="amp" + xla="xla" + num_gpus=1 + warmup_steps_p1="2000" + train_steps_p1=10000 + save_checkpoint_steps=500 + resume_training="false" + optimizer="lamb" + accumulate_gradients="true" + gradient_accumulation_steps_p1=384 + seed=42 + job_name="electra_lamb_pretraining" + train_batch_size_p2=24 + learning_rate_p2="4e-3" + warmup_steps_p2="200" + train_steps_p2=933 + gradient_accumulation_steps_p2=1152 + electra_model="base" + echo $train_batch_size_p1 $learning_rate_p1 $precision $num_gpus $xla \ + $warmup_steps_p1 $train_steps_p1 $save_checkpoint_steps \ + $resume_training $optimizer $accumulate_gradients \ + $gradient_accumulation_steps_p1 $seed $job_name \ + $train_batch_size_p2 $learning_rate_p2 \ + $warmup_steps_p2 $train_steps_p2 $gradient_accumulation_steps_p2 \ + $electra_model + +} + +dgx2_1gpu_fp32 () +{ + train_batch_size_p1="88" + learning_rate_p1="6e-3" + precision="fp32" + xla="xla" + num_gpus=1 + warmup_steps_p1="2000" + train_steps_p1=10000 + save_checkpoint_steps=500 + resume_training="false" + optimizer="lamb" + accumulate_gradients="true" + gradient_accumulation_steps_p1=768 + seed=42 + job_name="electra_lamb_pretraining" + train_batch_size_p2=12 + learning_rate_p2="4e-3" + warmup_steps_p2="200" + train_steps_p2=933 + gradient_accumulation_steps_p2=2304 + electra_model="base" + echo $train_batch_size_p1 $learning_rate_p1 $precision $num_gpus $xla \ + $warmup_steps_p1 $train_steps_p1 $save_checkpoint_steps \ + $resume_training $optimizer $accumulate_gradients \ + $gradient_accumulation_steps_p1 $seed $job_name \ + $train_batch_size_p2 $learning_rate_p2 \ + $warmup_steps_p2 $train_steps_p2 $gradient_accumulation_steps_p2 \ + $electra_model + +} + +# Full pretraining configs for NVIDIA DGX-1 (1x NVIDIA V100 16GB GPU) + +dgx1_1gpu_amp () +{ + train_batch_size_p1="88" + learning_rate_p1="6e-3" + precision="amp" + xla="xla" + num_gpus=1 + warmup_steps_p1="2000" + train_steps_p1=10000 + save_checkpoint_steps=500 + resume_training="false" + optimizer="lamb" + accumulate_gradients="true" + gradient_accumulation_steps_p1=768 + seed=42 + job_name="electra_lamb_pretraining" + train_batch_size_p2=12 + learning_rate_p2="4e-3" + warmup_steps_p2="200" + train_steps_p2=933 + gradient_accumulation_steps_p2=2304 + electra_model="base" + echo $train_batch_size_p1 $learning_rate_p1 $precision $num_gpus $xla \ + $warmup_steps_p1 $train_steps_p1 $save_checkpoint_steps \ + $resume_training $optimizer $accumulate_gradients \ + $gradient_accumulation_steps_p1 $seed $job_name \ + $train_batch_size_p2 $learning_rate_p2 \ + $warmup_steps_p2 $train_steps_p2 $gradient_accumulation_steps_p2 \ + $electra_model + +} + +dgx1_1gpu_fp32 () +{ + train_batch_size_p1="40" + learning_rate_p1="6e-3" + precision="fp32" + xla="xla" + num_gpus=1 + warmup_steps_p1="2000" + train_steps_p1=10000 + save_checkpoint_steps=500 + resume_training="false" + optimizer="lamb" + accumulate_gradients="true" + gradient_accumulation_steps_p1=1689 + seed=42 + job_name="electra_lamb_pretraining" + train_batch_size_p2=6 + learning_rate_p2="4e-3" + warmup_steps_p2="200" + 
train_steps_p2=933
+ gradient_accumulation_steps_p2=4608
+ electra_model="base"
+ echo $train_batch_size_p1 $learning_rate_p1 $precision $num_gpus $xla \
+ $warmup_steps_p1 $train_steps_p1 $save_checkpoint_steps \
+ $resume_training $optimizer $accumulate_gradients \
+ $gradient_accumulation_steps_p1 $seed $job_name \
+ $train_batch_size_p2 $learning_rate_p2 \
+ $warmup_steps_p2 $train_steps_p2 $gradient_accumulation_steps_p2 \
+ $electra_model
+
+}
diff --git a/modelzoo/ELECTRA/scripts/configs/squad_config.sh b/modelzoo/ELECTRA/scripts/configs/squad_config.sh
new file mode 100644
index 00000000..a29eb71c
--- /dev/null
+++ b/modelzoo/ELECTRA/scripts/configs/squad_config.sh
@@ -0,0 +1,271 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+dgxa100_8gpu_amp ()
+{
+ electra_model="google/electra-base-discriminator"
+ epochs="2"
+ batch_size="32"
+ infer_batch_size="512"
+ learning_rate="8e-4"
+ precision="amp"
+ num_gpu="8"
+ seed="1"
+ SQUAD_VERSION="1.1"
+ squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION"
+ OUT_DIR="results/"
+ init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt"
+ echo $electra_model $epochs $batch_size $infer_batch_size $learning_rate \
+ $precision $num_gpu $seed $SQUAD_VERSION $squad_dir \
+ $OUT_DIR $init_checkpoint
+}
+
+dgxa100_8gpu_tf32 ()
+{
+ electra_model="google/electra-base-discriminator"
+ epochs="2"
+ batch_size="32"
+ infer_batch_size="512"
+ learning_rate="8e-4"
+ precision="tf32"
+ num_gpu="8"
+ seed="1"
+ SQUAD_VERSION="1.1"
+ squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION"
+ OUT_DIR="results/"
+ init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt"
+ echo $electra_model $epochs $batch_size $infer_batch_size $learning_rate \
+ $precision $num_gpu $seed $SQUAD_VERSION $squad_dir \
+ $OUT_DIR $init_checkpoint
+}
+
+# Full SQuAD training configs for NVIDIA DGX-2H (16x NVIDIA V100 32GB GPU)
+
+dgx2_16gpu_amp ()
+{
+ electra_model="google/electra-base-discriminator"
+ epochs="2"
+ batch_size="32"
+ infer_batch_size="256"
+ learning_rate="1e-3"
+ precision="amp"
+ num_gpu="16"
+ seed="1"
+ SQUAD_VERSION="1.1"
+ squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION"
+ OUT_DIR="results/"
+ init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt"
+ echo $electra_model $epochs $batch_size $infer_batch_size $learning_rate \
+ $precision $num_gpu $seed $SQUAD_VERSION $squad_dir \
+ $OUT_DIR $init_checkpoint
+}
+
+dgx2_16gpu_fp32 ()
+{
+ electra_model="google/electra-base-discriminator"
+ epochs="2"
+ batch_size="32"
+ infer_batch_size="256"
+ learning_rate="1e-3"
+ precision="fp32"
+ num_gpu="16"
+ seed="1"
+ SQUAD_VERSION="1.1"
+ squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION"
+ OUT_DIR="results/"
+ init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt"
+ echo $electra_model $epochs $batch_size $infer_batch_size $learning_rate \
+ $precision $num_gpu $seed 
$SQUAD_VERSION $squad_dir \ + $OUT_DIR $init_checkpoint +} + +# Full SQuAD training configs for NVIDIA DGX-1 (8x NVIDIA V100 16GB GPU) + +dgx1_8gpu_amp () +{ + electra_model="google/electra-base-discriminator" + epochs="2" + batch_size="16" + infer_batch_size="256" + learning_rate="4e-4" + precision="amp" + num_gpu="8" + seed="1" + SQUAD_VERSION="1.1" + squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION" + OUT_DIR="results/" + init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt" + echo $electra_model $epochs $batch_size $infer_batch_size $learning_rate \ + $precision $num_gpu $seed $SQUAD_VERSION $squad_dir \ + $OUT_DIR $init_checkpoint +} + +dgx1_8gpu_fp32 () +{ + electra_model="google/electra-base-discriminator" + epochs="2" + batch_size="8" + infer_batch_size="256" + learning_rate="3e-4" + precision="fp32" + num_gpu="8" + seed="1" + SQUAD_VERSION="1.1" + squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION" + OUT_DIR="results/" + init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt" + echo $electra_model $epochs $batch_size $infer_batch_size $learning_rate \ + $precision $num_gpu $seed $SQUAD_VERSION $squad_dir \ + $OUT_DIR $init_checkpoint +} + +#Squad 2.0 +dgx1_8gpu_amp_v2 () +{ + electra_model="google/electra-base-discriminator" + epochs="3" + batch_size="16" + infer_batch_size="256" + learning_rate="4e-4" + precision="amp" + num_gpu="8" + seed="1" + SQUAD_VERSION="2.0" + squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION" + OUT_DIR="results/" + init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt" + echo $electra_model $epochs $batch_size $infer_batch_size $learning_rate \ + $precision $num_gpu $seed $SQUAD_VERSION $squad_dir \ + $OUT_DIR $init_checkpoint +} +# 1GPU configs + +dgxa100_1gpu_amp () +{ + electra_model="google/electra-base-discriminator" + epochs="2" + batch_size="32" + infer_batch_size="512" + learning_rate="2e-4" + precision="amp" + num_gpu="1" + seed="1" + SQUAD_VERSION="1.1" + squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION" + OUT_DIR="results/" + init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt" + echo $electra_model $epochs $batch_size $infer_batch_size $learning_rate \ + $precision $num_gpu $seed $SQUAD_VERSION $squad_dir \ + $OUT_DIR $init_checkpoint +} + +dgxa100_1gpu_tf32 () +{ + electra_model="google/electra-base-discriminator" + epochs="2" + batch_size="32" + infer_batch_size="512" + learning_rate="2e-4" + precision="tf32" + num_gpu="1" + seed="1" + SQUAD_VERSION="1.1" + squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION" + OUT_DIR="results/" + init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt" + echo $electra_model $epochs $batch_size $infer_batch_size $learning_rate \ + $precision $num_gpu $seed $SQUAD_VERSION $squad_dir \ + $OUT_DIR $init_checkpoint +} + +# Full SQuAD training configs for NVIDIA DGX-2H (16x NVIDIA V100 32GB GPU) + +dgx2_1gpu_amp () +{ + electra_model="google/electra-base-discriminator" + epochs="2" + batch_size="32" + infer_batch_size="256" + learning_rate="2e-4" + precision="amp" + num_gpu="1" + seed="1" + SQUAD_VERSION="1.1" + squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION" + OUT_DIR="results/" + init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt" + echo $electra_model $epochs $batch_size $infer_batch_size $learning_rate \ + $precision $num_gpu $seed $SQUAD_VERSION $squad_dir \ + $OUT_DIR $init_checkpoint +} + +dgx2_1gpu_fp32 () +{ + 
electra_model="google/electra-base-discriminator" + epochs="2" + batch_size="32" + infer_batch_size="256" + learning_rate="2e-4" + precision="fp32" + num_gpu="1" + seed="1" + SQUAD_VERSION="1.1" + squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION" + OUT_DIR="results/" + init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt" + echo $electra_model $epochs $batch_size $infer_batch_size $learning_rate \ + $precision $num_gpu $seed $SQUAD_VERSION $squad_dir \ + $OUT_DIR $init_checkpoint +} + +# Full SQuAD training configs for NVIDIA DGX-1 (8x NVIDIA V100 16GB GPU) + +dgx1_1gpu_amp () +{ + electra_model="google/electra-base-discriminator" + epochs="2" + batch_size="16" + infer_batch_size="256" + learning_rate="1e-4" + precision="amp" + num_gpu="1" + seed="1" + SQUAD_VERSION="1.1" + squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION" + OUT_DIR="results/" + init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt" + echo $electra_model $epochs $batch_size $infer_batch_size $learning_rate \ + $precision $num_gpu $seed $SQUAD_VERSION $squad_dir \ + $OUT_DIR $init_checkpoint +} + +dgx1_1gpu_fp32 () +{ + electra_model="google/electra-base-discriminator" + epochs="2" + batch_size="8" + infer_batch_size="256" + learning_rate="1e-4" + precision="fp32" + num_gpu="1" + seed="1" + SQUAD_VERSION="1.1" + squad_dir="/workspace/electra/data/download/squad/v$SQUAD_VERSION" + OUT_DIR="results/" + init_checkpoint="checkpoints/electra_base_qa_v2_False_epoch_2_ckpt" + echo $electra_model $epochs $batch_size $infer_batch_size $learning_rate \ + $precision $num_gpu $seed $SQUAD_VERSION $squad_dir \ + $OUT_DIR $init_checkpoint +} diff --git a/modelzoo/ELECTRA/scripts/docker/build.sh b/modelzoo/ELECTRA/scripts/docker/build.sh new file mode 100644 index 00000000..b0f1ec61 --- /dev/null +++ b/modelzoo/ELECTRA/scripts/docker/build.sh @@ -0,0 +1,15 @@ +#!/bin/bash +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +docker build --network=host . --rm -t electra diff --git a/modelzoo/ELECTRA/scripts/docker/launch.sh b/modelzoo/ELECTRA/scripts/docker/launch.sh new file mode 100644 index 00000000..b5bd7d60 --- /dev/null +++ b/modelzoo/ELECTRA/scripts/docker/launch.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +CMD=${1:-/bin/bash} +NV_VISIBLE_DEVICES=${2:-"all"} +DOCKER_BRIDGE=${3:-"host"} + +docker run -it --rm \ + --gpus device=$NV_VISIBLE_DEVICES \ + --net=$DOCKER_BRIDGE \ + --shm-size=1g \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + --privileged \ + -e LD_LIBRARY_PATH='/workspace/install/lib/' \ + -v $PWD:/workspace/electra \ + electra $CMD diff --git a/modelzoo/ELECTRA/scripts/finetune_ckpts_on_squad.sh b/modelzoo/ELECTRA/scripts/finetune_ckpts_on_squad.sh new file mode 100644 index 00000000..39d6a8c5 --- /dev/null +++ b/modelzoo/ELECTRA/scripts/finetune_ckpts_on_squad.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +checkpoints=${checkpoints:-"results/base/checkpoints"} +for folder in $checkpoints; do + + ckpts_dir=${folder} + output_dir=${folder} + + for f in $ckpts_dir/*.index; do + ckpt=${f%.*} + echo "==================================== START $ckpt ====================================" + python postprocess_pretrained_ckpt.py --pretrained_checkpoint=$ckpt --output_dir=$output_dir --amp + bash scripts/run_squad.sh $output_dir/discriminator; + echo "==================================== END $ckpt ===================================="; + done +done diff --git a/modelzoo/ELECTRA/scripts/run_pretraining.sh b/modelzoo/ELECTRA/scripts/run_pretraining.sh new file mode 100644 index 00000000..7fd3feb8 --- /dev/null +++ b/modelzoo/ELECTRA/scripts/run_pretraining.sh @@ -0,0 +1,171 @@ +#!/bin/bash + +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
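+
+# Positional arguments 1-24 are parsed below; any that are unset fall back to the
+# declared defaults. Illustrative phase-1 launch (values mirror the defaults and are
+# not a tuning recommendation):
+#   bash scripts/run_pretraining.sh 176 6e-3 amp 8 xla 2000 10000 500 false lamb true 48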
+ +echo "Container nvidia build = " $NVIDIA_BUILD_ID +train_batch_size_p1=${1:-176} +learning_rate_p1=${2:-"6e-3"} +precision=${3:-"amp"} +num_gpus=${4:-8} +xla=${5:-"xla"} +warmup_steps_p1=${6:-"2000"} +train_steps_p1=${7:-10000} +save_checkpoint_steps=${8:-500} +resume_training=${9:-"false"} +optimizer=${10:-"lamb"} +accumulate_gradients=${11:-"true"} +gradient_accumulation_steps_p1=${12:-48} +seed=${13:-12439} +job_name=${14:-"electra_lamb_pretraining"} +train_batch_size_p2=${15:-24} +learning_rate_p2=${16:-"4e-3"} +warmup_steps_p2=${17:-"200"} +train_steps_p2=${18:-933} +gradient_accumulation_steps_p2=${19:-144} +ELECTRA_MODEL=${20:-"base"} +DATASET_P1="tfrecord_lower_case_1_seq_len_128_random_seed_12345/books_wiki_en_corpus/train/pretrain_data*" # change this for other datasets +DATA_DIR_P1=${21:-"$DATA_PREP_WORKING_DIR/$DATASET_P1"} +DATASET_P2="tfrecord_lower_case_1_seq_len_512_random_seed_12345/books_wiki_en_corpus/train/pretrain_data*" # change this for other datasets +DATA_DIR_P2=${22:-"$DATA_PREP_WORKING_DIR/$DATASET_P2"} +CODEDIR=${23:-"/workspace/electra"} +init_checkpoint=${24:-"None"} +restore_checkpoint=${restore_checkpoint:-"true"} +RESULTS_DIR=$CODEDIR/results + +if [ ! -d "$RESULTS_DIR" ] ; then + echo "Error! $RESULTS_DIR directory missing." + exit -1 +fi + +PREFIX="" +TEST_RESULT=$(awk 'BEGIN {print ('1' <= '${num_gpus}')}') +if [ "$TEST_RESULT" == 1 ] ; then + PREFIX="horovodrun -np $num_gpus " +fi + +if [ "$precision" = "amp" ] ; then + PREC="--amp " +elif [ "$precision" = "fp32" ] ; then + PREC="" +elif [ "$precision" = "tf32" ] ; then + PREC="" +else + echo "Unknown argument" + exit -2 +fi + +if [ "$xla" = "xla" ] ; then + PREC="$PREC --xla" +fi + +ACCUMULATE_GRADIENTS="" +if [ "$accumulate_gradients" == "true" ] ; then + ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps_p1" +fi + +CHECKPOINT="" +if [ "$resume_training" == "true" ] ; then + CHECKPOINT="--restore_checkpoint=latest" +fi + +if [ "$init_checkpoint" != "None" ] ; then + CHECKPOINT="--restore_checkpoint=$init_checkpoint" +fi + +CMD=" $CODEDIR/run_pretraining.py" +CMD+=" --model_name=${ELECTRA_MODEL}" +CMD+=" --pretrain_tfrecords=$DATA_DIR_P1" +CMD+=" --model_size=${ELECTRA_MODEL}" +CMD+=" --train_batch_size=$train_batch_size_p1" +CMD+=" --max_seq_length=128 --disc_weight=50.0 --generator_hidden_size=0.3333333 " +CMD+=" --num_train_steps=$train_steps_p1" +CMD+=" --num_warmup_steps=$warmup_steps_p1" +CMD+=" --save_checkpoints_steps=$save_checkpoint_steps" +CMD+=" --learning_rate=$learning_rate_p1" +CMD+=" --optimizer=${optimizer} --skip_adaptive --opt_beta_1=0.878 --opt_beta_2=0.974 --lr_decay_power=0.5" +CMD+=" --seed=$seed" +CMD+=" $PREC" +CMD+=" $ACCUMULATE_GRADIENTS" +CMD+=" $CHECKPOINT" +CMD+=" --log_dir ${RESULTS_DIR} " + +CMD="$PREFIX python3 $CMD" +echo "Launch command: $CMD" + +printf -v TAG "electra_pretraining_phase1_%s" "$precision" +DATESTAMP=`date +'%y%m%d%H%M%S'` +LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log +printf "Logs written to %s\n" "$LOGFILE" + +set -x +if [ -z "$LOGFILE" ] ; then + $CMD +else + ( + $CMD + ) |& tee $LOGFILE +fi + +set +x + +echo "finished pretraining phase1" + +#Start Phase2 +ACCUMULATE_GRADIENTS="" +if [ "$accumulate_gradients" == "true" ] ; then + ACCUMULATE_GRADIENTS="--gradient_accumulation_steps=$gradient_accumulation_steps_p2" +fi + +RESTORE_CHECKPOINT="" +if [ "$restore_checkpoint" == "true" ] ; then + RESTORE_CHECKPOINT="--restore_checkpoint=latest --phase2" +fi + +CMD=" $CODEDIR/run_pretraining.py" +CMD+=" 
--model_name=${ELECTRA_MODEL}" +CMD+=" --pretrain_tfrecords=$DATA_DIR_P2" +CMD+=" --model_size=${ELECTRA_MODEL}" +CMD+=" --train_batch_size=$train_batch_size_p2" +CMD+=" --max_seq_length=512 --disc_weight=50.0 --generator_hidden_size=0.3333333 ${RESTORE_CHECKPOINT}" +CMD+=" --num_train_steps=$train_steps_p2" +CMD+=" --num_warmup_steps=$warmup_steps_p2" +CMD+=" --save_checkpoints_steps=$save_checkpoint_steps" +CMD+=" --learning_rate=$learning_rate_p2" +CMD+=" --optimizer=${optimizer} --skip_adaptive --opt_beta_1=0.878 --opt_beta_2=0.974 --lr_decay_power=0.5" +CMD+=" --seed=$seed" +CMD+=" $PREC" +CMD+=" $ACCUMULATE_GRADIENTS" +CMD+=" --log_dir ${RESULTS_DIR} " + +CMD="$PREFIX python3 $CMD" +echo "Launch command: $CMD" + + +printf -v TAG "electra_pretraining_phase2_%s" "$precision" +DATESTAMP=`date +'%y%m%d%H%M%S'` +LOGFILE=$RESULTS_DIR/$job_name.$TAG.$DATESTAMP.log +printf "Logs written to %s\n" "$LOGFILE" + +set -x +if [ -z "$LOGFILE" ] ; then + $CMD +else + ( + $CMD + ) |& tee $LOGFILE +fi + +set +x + +echo "finished pretraining phase2" diff --git a/modelzoo/ELECTRA/scripts/run_squad.sh b/modelzoo/ELECTRA/scripts/run_squad.sh new file mode 100644 index 00000000..c9ac17bf --- /dev/null +++ b/modelzoo/ELECTRA/scripts/run_squad.sh @@ -0,0 +1,112 @@ +#!/usr/bin/env bash + +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +echo "Container nvidia build = " $NVIDIA_BUILD_ID + +electra_model=${1:-"google/electra-base-discriminator"} +epochs=${2:-"2"} +batch_size=${3:-"16"} +infer_batch_size=${4:-"128"} +learning_rate=${5:-"4e-4"} +precision=${6:-"amp"} +num_gpu=${7:-"8"} +seed=${8:-"$RANDOM"} +SQUAD_VERSION=${9:-"1.1"} +squad_dir=${10:-"/workspace/electra/data/download/squad/v$SQUAD_VERSION"} +OUT_DIR=${11:-"results/"} +init_checkpoint=${12:-"None"} +mode=${13:-"train_eval"} +env=${14:-"interactive"} +cache_dir=${15:-"$squad_dir"} +max_steps=${16:-"-1"} + +echo "out dir is $OUT_DIR" +mkdir -p $OUT_DIR +if [ ! -d "$OUT_DIR" ]; then + echo "ERROR: non existing $OUT_DIR" + exit 1 +fi + +use_fp16="" +if [ "$precision" = "amp" ] ; then + echo "mixed-precision training and xla activated!" 
+ use_fp16=" --amp --xla " +fi + +if [ "$num_gpu" = "1" ] ; then + export CUDA_VISIBLE_DEVICES=0 + mpi_command=" " +else + unset CUDA_VISIBLE_DEVICES + mpi_command=" horovodrun -np $num_gpu " +fi + +if [ "$env" = "cluster" ] ; then + unset CUDA_VISIBLE_DEVICES + mpi_command=" " +fi + +v2="" +echo "Running SQuAD-v$SQUAD_VERSION" +if [ "$SQUAD_VERSION" = "2.0" ] ; then + v2=" --version_2_with_negative " +fi + +CMD=" $mpi_command python run_tf_squad.py " +CMD+="--init_checkpoint=$init_checkpoint " +if [ "$mode" = "train" ] ; then + CMD+="--do_train " + CMD+="--train_batch_size=$batch_size " +elif [ "$mode" = "eval" ] ; then + CMD+="--do_predict " + CMD+="--predict_batch_size=$infer_batch_size " + CMD+="--eval_script=$squad_dir/evaluate-v$SQUAD_VERSION.py " + CMD+="--do_eval " +elif [ "$mode" = "prediction" ] ; then + CMD+="--do_predict " + CMD+="--predict_batch_size=$infer_batch_size " +else + CMD+=" --do_train " + CMD+=" --train_batch_size=$batch_size " + CMD+="--do_predict " + CMD+="--predict_batch_size=$infer_batch_size " + CMD+="--eval_script=$squad_dir/evaluate-v$SQUAD_VERSION.py " + CMD+="--do_eval " +fi + +CMD+=" $v2 " +CMD+=" --data_dir $squad_dir " +CMD+=" --do_lower_case " +CMD+=" --electra_model=$electra_model " +CMD+=" --learning_rate=$learning_rate " +CMD+=" --warmup_proportion 0.05 " +CMD+=" --weight_decay_rate 0.01 " +CMD+=" --layerwise_lr_decay 0.8 " +CMD+=" --seed=$seed " +CMD+=" --num_train_epochs=$epochs " +CMD+=" --max_seq_length=384 " +CMD+=" --doc_stride=128 " +CMD+=" --beam_size 5 " +CMD+=" --joint_head True " +CMD+=" --null_score_diff_threshold -5.6 " +CMD+=" --output_dir=$OUT_DIR " +CMD+=" $use_fp16" +CMD+=" --cache_dir=$cache_dir " +CMD+=" --max_steps=$max_steps " +CMD+=" --vocab_file=/workspace/electra/vocab/vocab.txt " + +LOGFILE=$OUT_DIR/logfile.txt +echo "$CMD |& tee $LOGFILE" +time $CMD |& tee $LOGFILE diff --git a/modelzoo/ELECTRA/squad_utils.py b/modelzoo/ELECTRA/squad_utils.py new file mode 100644 index 00000000..a15c4dd9 --- /dev/null +++ b/modelzoo/ELECTRA/squad_utils.py @@ -0,0 +1,1093 @@ +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import csv +import logging +import os +import math +import collections +from functools import partial +from multiprocessing import Pool, cpu_count +import horovod.tensorflow as hvd + +import numpy as np +from tqdm import tqdm + + +from file_utils import is_tf_available, is_torch_available +from tokenization_utils import BasicTokenizer, whitespace_tokenize + +if is_torch_available(): + import torch + from torch.utils.data import TensorDataset + +if is_tf_available(): + import tensorflow as tf + +logger = logging.getLogger(__name__) + + +def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text): + """Returns tokenized answer spans that better match the annotated answer.""" + tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) + + for new_start in range(input_start, input_end + 1): + for new_end in range(input_end, new_start - 1, -1): + text_span = " ".join(doc_tokens[new_start : (new_end + 1)]) + if text_span == tok_answer_text: + return (new_start, new_end) + + return (input_start, input_end) + + +def _check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +def _new_check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + # if len(doc_spans) == 1: + # return True + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span["start"] + doc_span["length"] - 1 + if position < doc_span["start"]: + continue + if position > end: + continue + num_left_context = position - doc_span["start"] + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"] + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +def _is_whitespace(c): + if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False + + +def squad_convert_example_to_features(example, max_seq_length, doc_stride, max_query_length, is_training): + features = [] + if is_training and not example.is_impossible: + # Get start and end position + start_position = example.start_position + end_position = example.end_position + + # If the answer cannot be found in the text, then skip this example. + actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)]) + cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text)) + if actual_text.find(cleaned_answer_text) == -1: + logger.warning("Could not find answer: '%s' vs. 
'%s'", actual_text, cleaned_answer_text) + return [] + + tok_to_orig_index = [] + orig_to_tok_index = [] + all_doc_tokens = [] + for (i, token) in enumerate(example.doc_tokens): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = tokenizer.tokenize(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + + if is_training and not example.is_impossible: + tok_start_position = orig_to_tok_index[example.start_position] + if example.end_position < len(example.doc_tokens) - 1: + tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 + else: + tok_end_position = len(all_doc_tokens) - 1 + + (tok_start_position, tok_end_position) = _improve_answer_span( + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text + ) + + spans = [] + + truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length) + sequence_added_tokens = ( + tokenizer.max_len - tokenizer.max_len_single_sentence + 1 + if "roberta" in str(type(tokenizer)) or "camembert" in str(type(tokenizer)) + else tokenizer.max_len - tokenizer.max_len_single_sentence + ) + sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair + + span_doc_tokens = all_doc_tokens + while len(spans) * doc_stride < len(all_doc_tokens): + + encoded_dict = tokenizer.encode_plus( + truncated_query if tokenizer.padding_side == "right" else span_doc_tokens, + span_doc_tokens if tokenizer.padding_side == "right" else truncated_query, + max_length=max_seq_length, + return_overflowing_tokens=True, + pad_to_max_length=True, + stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, + truncation_strategy="only_second" if tokenizer.padding_side == "right" else "only_first", + return_token_type_ids=True, + ) + + paragraph_len = min( + len(all_doc_tokens) - len(spans) * doc_stride, + max_seq_length - len(truncated_query) - sequence_pair_added_tokens, + ) + + if tokenizer.pad_token_id in encoded_dict["input_ids"]: + if tokenizer.padding_side == "right": + non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)] + else: + last_padding_id_position = ( + len(encoded_dict["input_ids"]) - 1 - encoded_dict["input_ids"][::-1].index(tokenizer.pad_token_id) + ) + non_padded_ids = encoded_dict["input_ids"][last_padding_id_position + 1 :] + + else: + non_padded_ids = encoded_dict["input_ids"] + + tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) + + token_to_orig_map = {} + for i in range(paragraph_len): + index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i + token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i] + + encoded_dict["paragraph_len"] = paragraph_len + encoded_dict["tokens"] = tokens + encoded_dict["token_to_orig_map"] = token_to_orig_map + encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens + encoded_dict["token_is_max_context"] = {} + encoded_dict["start"] = len(spans) * doc_stride + encoded_dict["length"] = paragraph_len + + spans.append(encoded_dict) + + if "overflowing_tokens" not in encoded_dict: + break + span_doc_tokens = encoded_dict["overflowing_tokens"] + + for doc_span_index in range(len(spans)): + for j in range(spans[doc_span_index]["paragraph_len"]): + is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j) + index = ( + j + if 
tokenizer.padding_side == "left" + else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j + ) + spans[doc_span_index]["token_is_max_context"][index] = is_max_context + + for span in spans: + # Identify the position of the CLS token + cls_index = span["input_ids"].index(tokenizer.cls_token_id) + + # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) + # Original TF implem also keep the classification token (set to 0) (not sure why...) + p_mask = np.array(span["token_type_ids"]) + + p_mask = np.minimum(p_mask, 1) + + if tokenizer.padding_side == "right": + # Limit positive values to one + p_mask = 1 - p_mask + + p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 1 + + # Set the CLS index to '0' + p_mask[cls_index] = 0 + + span_is_impossible = example.is_impossible + start_position = 0 + end_position = 0 + if is_training and not span_is_impossible: + # For training, if our document chunk does not contain an annotation + # we throw it out, since there is nothing to predict. + doc_start = span["start"] + doc_end = span["start"] + span["length"] - 1 + out_of_span = False + + if not (tok_start_position >= doc_start and tok_end_position <= doc_end): + out_of_span = True + + if out_of_span: + start_position = cls_index + end_position = cls_index + span_is_impossible = True + else: + if tokenizer.padding_side == "left": + doc_offset = 0 + else: + doc_offset = len(truncated_query) + sequence_added_tokens + + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + + features.append( + SquadFeatures( + span["input_ids"], + span["attention_mask"], + span["token_type_ids"], + cls_index, + p_mask.tolist(), + example_index=0, # Can not set unique_id and example_index here. They will be set after multiple processing. + unique_id=0, + paragraph_len=span["paragraph_len"], + token_is_max_context=span["token_is_max_context"], + tokens=span["tokens"], + token_to_orig_map=span["token_to_orig_map"], + start_position=start_position, + end_position=end_position, + is_impossible=span_is_impossible, + ) + ) + return features + + +def squad_convert_example_to_features_init(tokenizer_for_convert): + global tokenizer + tokenizer = tokenizer_for_convert + + +def squad_convert_examples_to_features( + examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, return_dataset=False, threads=1 +): + """ + Converts a list of examples into a list of features that can be directly given as input to a model. + It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs. + + Args: + examples: list of :class:`~transformers.data.processors.squad.SquadExample` + tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer` + max_seq_length: The maximum sequence length of the inputs. + doc_stride: The stride used when the context is too large and is split across several features. + max_query_length: The maximum length of the query. + is_training: whether to create features for model evaluation or model training. + return_dataset: Default False. Either 'pt' or 'tf'. 
+ if 'pt': returns a torch.data.TensorDataset, + if 'tf': returns a tf.data.Dataset + threads: multiple processing threadsa-smi + + + Returns: + list of :class:`~transformers.data.processors.squad.SquadFeatures` + + Example:: + + processor = SquadV2Processor() + examples = processor.get_dev_examples(data_dir) + + features = squad_convert_examples_to_features( + examples=examples, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + doc_stride=args.doc_stride, + max_query_length=args.max_query_length, + is_training=not evaluate, + ) + """ + + # Defining helper methods + features = [] + threads = min(threads, cpu_count()) + with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p: + annotate_ = partial( + squad_convert_example_to_features, + max_seq_length=max_seq_length, + doc_stride=doc_stride, + max_query_length=max_query_length, + is_training=is_training, + ) + features = list( + tqdm( + p.imap(annotate_, examples, chunksize=32), + total=len(examples), + desc="convert squad examples to features", + mininterval=5, + disable=hvd.rank() not in [-1, 0] + ) + ) + new_features = [] + unique_id = 1000000000 + example_index = 0 + for example_features in tqdm(features, total=len(features), desc="add example index and unique id", + mininterval=5, disable=hvd.rank() not in [-1, 0]): + if not example_features: + continue + for example_feature in example_features: + example_feature.example_index = example_index + example_feature.unique_id = unique_id + new_features.append(example_feature) + unique_id += 1 + example_index += 1 + features = new_features + del new_features + if return_dataset == "pt": + if not is_torch_available(): + raise RuntimeError("PyTorch must be installed to return a PyTorch dataset.") + + # Convert to Tensors and build dataset + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long) + all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) + all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long) + all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float) + all_is_impossible = torch.tensor([f.is_impossible for f in features], dtype=torch.float) + + if not is_training: + all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) + dataset = TensorDataset( + all_input_ids, all_attention_masks, all_token_type_ids, all_example_index, all_cls_index, all_p_mask + ) + else: + all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long) + all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long) + dataset = TensorDataset( + all_input_ids, + all_attention_masks, + all_token_type_ids, + all_start_positions, + all_end_positions, + all_cls_index, + all_p_mask, + all_is_impossible, + ) + + return features, dataset + elif return_dataset == "tf": + if not is_tf_available(): + raise RuntimeError("TensorFlow must be installed to return a TensorFlow dataset.") + + def gen(): + for ex in features: + yield ( + { + "input_ids": ex.input_ids, + "attention_mask": ex.attention_mask, + "token_type_ids": ex.token_type_ids, + }, + { + "start_position": ex.start_position, + "end_position": ex.end_position, + "cls_index": ex.cls_index, + "p_mask": ex.p_mask, + "is_impossible": ex.is_impossible, + }, + ) + + return tf.data.Dataset.from_generator( + gen, + ( + {"input_ids": tf.int32, 
"attention_mask": tf.int32, "token_type_ids": tf.int32}, + { + "start_position": tf.int64, + "end_position": tf.int64, + "cls_index": tf.int64, + "p_mask": tf.int32, + "is_impossible": tf.int32, + }, + ), + ( + { + "input_ids": tf.TensorShape([None]), + "attention_mask": tf.TensorShape([None]), + "token_type_ids": tf.TensorShape([None]), + }, + { + "start_position": tf.TensorShape([]), + "end_position": tf.TensorShape([]), + "cls_index": tf.TensorShape([]), + "p_mask": tf.TensorShape([None]), + "is_impossible": tf.TensorShape([]), + }, + ), + ) + + return features + + +class DataProcessor(object): # TODO can be removed + """Base class for data converters for sequence classification data sets.""" + + def get_example_from_tensor_dict(self, tensor_dict): + """Gets an example from a dict with tensorflow tensors + Args: + tensor_dict: Keys and values should match the corresponding Glue + tensorflow_dataset examples. + """ + raise NotImplementedError() + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + def tfds_map(self, example): + """Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. + This method converts examples to the correct format.""" + if len(self.get_labels()) > 1: + example.label = self.get_labels()[int(example.label)] + return example + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, "r", encoding="utf-8-sig") as f: + return list(csv.reader(f, delimiter="\t", quotechar=quotechar)) + + +class SquadProcessor(DataProcessor): + """ + Processor for the SQuAD data set. + Overriden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and version 2.0 of SQuAD, respectively. + """ + + train_file = None + dev_file = None + + def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False): + if not evaluate: + answer = tensor_dict["answers"]["text"][0].numpy().decode("utf-8") + answer_start = tensor_dict["answers"]["answer_start"][0].numpy() + answers = [] + else: + answers = [ + {"answer_start": start.numpy(), "text": text.numpy().decode("utf-8")} + for start, text in zip(tensor_dict["answers"]["answer_start"], tensor_dict["answers"]["text"]) + ] + + answer = None + answer_start = None + + return SquadExample( + qas_id=tensor_dict["id"].numpy().decode("utf-8"), + question_text=tensor_dict["question"].numpy().decode("utf-8"), + context_text=tensor_dict["context"].numpy().decode("utf-8"), + answer_text=answer, + start_position_character=answer_start, + title=tensor_dict["title"].numpy().decode("utf-8"), + answers=answers, + ) + + def get_examples_from_dataset(self, dataset, evaluate=False): + """ + Creates a list of :class:`~transformers.data.processors.squad.SquadExample` using a TFDS dataset. 
+ + Args: + dataset: The tfds dataset loaded from `tensorflow_datasets.load("squad")` + evaluate: boolean specifying if in evaluation mode or in training mode + + Returns: + List of SquadExample + + Examples:: + + import tensorflow_datasets as tfds + dataset = tfds.load("squad") + + training_examples = get_examples_from_dataset(dataset, evaluate=False) + evaluation_examples = get_examples_from_dataset(dataset, evaluate=True) + """ + + if evaluate: + dataset = dataset["validation"] + else: + dataset = dataset["train"] + + examples = [] + for tensor_dict in tqdm(dataset, mininterval=5, disable=hvd.rank() not in [-1, 0]): + examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate)) + + return examples + + def get_train_examples(self, data_dir, filename=None): + """ + Returns the training examples from the data directory. + + Args: + data_dir: Directory containing the data files used for training and evaluating. + filename: None by default, specify this if the training file has a different name than the original one + which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively. + + """ + if data_dir is None: + data_dir = "" + + if self.train_file is None: + raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") + + with open( + os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding="utf-8" + ) as reader: + input_data = json.load(reader)["data"] + return self._create_examples(input_data, "train") + + def get_dev_examples(self, data_dir, filename=None): + """ + Returns the evaluation example from the data directory. + + Args: + data_dir: Directory containing the data files used for training and evaluating. + filename: None by default, specify this if the evaluation file has a different name than the original one + which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively. 
+ """ + if data_dir is None: + data_dir = "" + + if self.dev_file is None: + raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") + + with open( + os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding="utf-8" + ) as reader: + input_data = json.load(reader)["data"] + return self._create_examples(input_data, "dev") + + def _create_examples(self, input_data, set_type): + is_training = set_type == "train" + examples = [] + for entry in tqdm(input_data, mininterval=5, disable=hvd.rank() not in [-1, 0]): + title = entry["title"] + for paragraph in entry["paragraphs"]: + context_text = paragraph["context"] + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position_character = None + answer_text = None + answers = [] + + if "is_impossible" in qa: + is_impossible = qa["is_impossible"] + else: + is_impossible = False + + if not is_impossible: + if is_training: + answer = qa["answers"][0] + answer_text = answer["text"] + start_position_character = answer["answer_start"] + else: + answers = qa["answers"] + + example = SquadExample( + qas_id=qas_id, + question_text=question_text, + context_text=context_text, + answer_text=answer_text, + start_position_character=start_position_character, + title=title, + is_impossible=is_impossible, + answers=answers, + ) + + examples.append(example) + return examples + + +class SquadV1Processor(SquadProcessor): + train_file = "train-v1.1.json" + dev_file = "dev-v1.1.json" + + +class SquadV2Processor(SquadProcessor): + train_file = "train-v2.0.json" + dev_file = "dev-v2.0.json" + + +class SquadExample(object): + """ + A single training/test example for the Squad dataset, as loaded from disk. + + Args: + qas_id: The example's unique identifier + question_text: The question string + context_text: The context string + answer_text: The answer string + start_position_character: The character position of the start of the answer + title: The title of the example + answers: None by default, this is used during evaluation. Holds answers as well as their start positions. + is_impossible: False by default, set to True if the example has no possible answer. + """ + + def __init__( + self, + qas_id, + question_text, + context_text, + answer_text, + start_position_character, + title, + answers=[], + is_impossible=False, + ): + self.qas_id = qas_id + self.question_text = question_text + self.context_text = context_text + self.answer_text = answer_text + self.title = title + self.is_impossible = is_impossible + self.answers = answers + + self.start_position, self.end_position = 0, 0 + + doc_tokens = [] + char_to_word_offset = [] + prev_is_whitespace = True + + # Split on whitespace so that different tokens may be attributed to their original position. + for c in self.context_text: + if _is_whitespace(c): + prev_is_whitespace = True + else: + if prev_is_whitespace: + doc_tokens.append(c) + else: + doc_tokens[-1] += c + prev_is_whitespace = False + char_to_word_offset.append(len(doc_tokens) - 1) + + self.doc_tokens = doc_tokens + self.char_to_word_offset = char_to_word_offset + + # Start and end positions only has a value during evaluation. 
+ if start_position_character is not None and not is_impossible: + self.start_position = char_to_word_offset[start_position_character] + self.end_position = char_to_word_offset[ + min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1) + ] + + +class SquadFeatures(object): + """ + Single squad example features to be fed to a model. + Those features are model-specific and can be crafted from :class:`~transformers.data.processors.squad.SquadExample` + using the :method:`~transformers.data.processors.squad.squad_convert_examples_to_features` method. + + Args: + input_ids: Indices of input sequence tokens in the vocabulary. + attention_mask: Mask to avoid performing attention on padding token indices. + token_type_ids: Segment token indices to indicate first and second portions of the inputs. + cls_index: the index of the CLS token. + p_mask: Mask identifying tokens that can be answers vs. tokens that cannot. + Mask with 1 for tokens than cannot be in the answer and 0 for token that can be in an answer + example_index: the index of the example + unique_id: The unique Feature identifier + paragraph_len: The length of the context + token_is_max_context: List of booleans identifying which tokens have their maximum context in this feature object. + If a token does not have their maximum context in this feature object, it means that another feature object + has more information related to that token and should be prioritized over this feature for that token. + tokens: list of tokens corresponding to the input ids + token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer. + start_position: start of the answer token index + end_position: end of the answer token index + """ + + def __init__( + self, + input_ids, + attention_mask, + token_type_ids, + cls_index, + p_mask, + example_index, + unique_id, + paragraph_len, + token_is_max_context, + tokens, + token_to_orig_map, + start_position, + end_position, + is_impossible, + ): + self.input_ids = input_ids + self.attention_mask = attention_mask + self.token_type_ids = token_type_ids + self.cls_index = cls_index + self.p_mask = p_mask + + self.example_index = example_index + self.unique_id = unique_id + self.paragraph_len = paragraph_len + self.token_is_max_context = token_is_max_context + self.tokens = tokens + self.token_to_orig_map = token_to_orig_map + + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + +class SquadResult(object): + """ + Constructs a SquadResult which can be used to evaluate a model's output on the SQuAD dataset. + + Args: + unique_id: The unique identifier corresponding to that example. 
+ start_logits: The logits corresponding to the start of the answer + end_logits: The logits corresponding to the end of the answer + """ + + def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None): + self.start_logits = start_logits + self.end_logits = end_logits + self.unique_id = unique_id + + if start_top_index: + self.start_top_index = start_top_index + self.end_top_index = end_top_index + self.cls_logits = cls_logits + + + + + + +RawResult = collections.namedtuple("RawResult", + ["unique_id", "start_logits", "end_logits"]) + + +def get_answers(examples, features, results, args): + predictions = collections.defaultdict(list) # it is possible that one example corresponds to multiple features + _Prediction = collections.namedtuple('_Prediction', ['text', 'start_logit', 'end_logit']) + + if args.version_2_with_negative: + null_vals = collections.defaultdict(lambda: (float("inf"), 0, 0)) + + for ex, feat, result in match_results(examples, features, results): + if not args.joint_head: + start_indices = _get_best_indices(result.start_logits, args.n_best_size) + end_indices = _get_best_indices(result.end_logits, args.n_best_size) + prelim_predictions = get_valid_prelim_predictions(start_indices, end_indices, feat, result, args) + feature_null_score = result.start_logits[0] + result.end_logits[0] + + else: + prelim_predictions = get_valid_prelim_predictions_joint_head(result.start_top_index, result.end_top_index, + feat, result, args) + # start_indices = result.start_top_index + # end_indices = result.end_top_index + feature_null_score = result.cls_logits + + prelim_predictions = sorted( + prelim_predictions, + key=lambda x: (x.start_logit + x.end_logit), + reverse=True) + if args.version_2_with_negative and feature_null_score < null_vals[ex.qas_id][0]: + null_vals[ex.qas_id] = (feature_null_score, result.start_logits[0], result.end_logits[0]) + + curr_predictions = [] + seen_predictions = set() + for pred in prelim_predictions: + if len(curr_predictions) == args.n_best_size: + break + if pred.start_index > 0: # this is a non-null prediction TODO: this probably is irrelevant + final_text = get_answer_text(ex, feat, pred, args) + else: + final_text = '' + if final_text in seen_predictions: + continue + + seen_predictions.add(final_text) + curr_predictions.append(_Prediction(final_text, pred.start_logit, pred.end_logit)) + predictions[ex.qas_id] += curr_predictions + + # Add empty prediction + if args.version_2_with_negative: + for qas_id in predictions.keys(): + predictions[qas_id].append(_Prediction('', + null_vals[qas_id][1], + null_vals[qas_id][2])) + + nbest_answers = collections.defaultdict(list) + answers = {} + for qas_id, preds in predictions.items(): + # nbest = sorted( + # preds, + # key=lambda x: (x.start_logit + x.end_logit), + # reverse=True)[:args.n_best_size] + seen_predictions = set() + nbest = [] + for pred in sorted(predictions[qas_id], key=lambda x: (x.start_logit + x.end_logit), reverse=True): + if len(nbest) >= args.n_best_size: + break + if pred.text in seen_predictions: + continue + seen_predictions.add(pred.text) + nbest.append(pred) + + # In very rare edge cases we could only have single null prediction. + # So we just create a nonce prediction in this case to avoid failure. 
+ if not nbest or (args.version_2_with_negative and len(nbest) == 1): + nbest.append(_Prediction(text="empty", start_logit=0.0, end_logit=0.0)) + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_logit + entry.end_logit) + if not best_non_null_entry and entry.text: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_logit"] = entry.start_logit + output["end_logit"] = entry.end_logit + nbest_answers[qas_id].append(output) + + if args.version_2_with_negative: + if not args.joint_head: + score_diff = null_vals[qas_id][0] - best_non_null_entry.start_logit - best_non_null_entry.end_logit + else: + score_diff = null_vals[qas_id][0] + if score_diff > args.null_score_diff_threshold: + answers[qas_id] = "" + else: + answers[qas_id] = best_non_null_entry.text + else: + answers[qas_id] = nbest_answers[qas_id][0]['text'] + + return answers, nbest_answers + + +def get_answer_text(example, feature, pred, args): + tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] + orig_doc_start = feature.token_to_orig_map[pred.start_index] + orig_doc_end = feature.token_to_orig_map[pred.end_index] + orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] + tok_text = " ".join(tok_tokens) + + # De-tokenize WordPieces that have been split off. + tok_text = tok_text.replace(" ##", "") + tok_text = tok_text.replace("##", "") + + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) + + final_text = get_final_text(tok_text, orig_text, args.do_lower_case, args.verbose_logging) + return final_text + + +def get_valid_prelim_predictions_joint_head(start_indices, end_indices, feature, result, args): + _PrelimPrediction = collections.namedtuple( + "PrelimPrediction", + ["start_index", "end_index", "start_logit", "end_logit"]) + prelim_predictions = [] + # for start_index in start_indices: + + for i in range(args.beam_size): + start_index = start_indices[i] + for j in range(args.beam_size): + # for end_index in end_indices: + end_index = end_indices[i * args.beam_size + j] + if start_index >= len(feature.tokens): + continue + if end_index >= len(feature.tokens): + continue + if start_index not in feature.token_to_orig_map: + continue + if end_index not in feature.token_to_orig_map: + continue + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > args.max_answer_length: + continue + prelim_predictions.append( + _PrelimPrediction( + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[i], # start_index], + end_logit=result.end_logits[i * args.beam_size + j])) # end_index])) + return prelim_predictions + + +def get_valid_prelim_predictions(start_indices, end_indices, feature, result, args): + _PrelimPrediction = collections.namedtuple( + "PrelimPrediction", + ["start_index", "end_index", "start_logit", "end_logit"]) + prelim_predictions = [] + for start_index in start_indices: + for end_index in end_indices: + if start_index >= len(feature.tokens): + continue + if end_index >= len(feature.tokens): + continue + if start_index not in feature.token_to_orig_map: + continue + if end_index not in feature.token_to_orig_map: + continue + if not 
feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > args.max_answer_length: + continue + prelim_predictions.append( + _PrelimPrediction( + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[start_index], + end_logit=result.end_logits[end_index])) + return prelim_predictions + + +def match_results(examples, features, results): + unique_f_ids = set([f.unique_id for f in features]) + unique_r_ids = set([r.unique_id for r in results]) + matching_ids = unique_f_ids & unique_r_ids + features = [f for f in features if f.unique_id in matching_ids] + results = [r for r in results if r.unique_id in matching_ids] + features.sort(key=lambda x: x.unique_id) + results.sort(key=lambda x: x.unique_id) + + for f, r in zip(features, results): # original code assumes strict ordering of examples. TODO: rewrite this + yield examples[f.example_index], f, r + + +def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): + """Project the tokenized prediction back to the original text.""" + + def _strip_spaces(text): + ns_chars = [] + ns_to_s_map = collections.OrderedDict() + for (i, c) in enumerate(text): + if c == " ": + continue + ns_to_s_map[len(ns_chars)] = i + ns_chars.append(c) + ns_text = "".join(ns_chars) + return (ns_text, ns_to_s_map) + + # We first tokenize `orig_text`, strip whitespace from the result + # and `pred_text`, and check if they are the same length. If they are + # NOT the same length, the heuristic has failed. If they are the same + # length, we assume the characters are one-to-one aligned. + + tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + + tok_text = " ".join(tokenizer.tokenize(orig_text)) + + start_position = tok_text.find(pred_text) + if start_position == -1: + if verbose_logging: + logger.info( + "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) + return orig_text + end_position = start_position + len(pred_text) - 1 + + (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) + (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) + + if len(orig_ns_text) != len(tok_ns_text): + if verbose_logging: + logger.info("Length not equal after stripping spaces: '%s' vs '%s'", + orig_ns_text, tok_ns_text) + return orig_text + + # We then project the characters in `pred_text` back to `orig_text` using + # the character-to-character alignment. 
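+  # Hypothetical illustration: for an original text "John  Smith" the stripped
+  # text is "JohnSmith", and orig_ns_to_s_map is {0: 0, 1: 1, 2: 2, 3: 3, 4: 6, ...},
+  # so a position found in the space-free text can be walked back to the matching
+  # character offset in the original text.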
+ tok_s_to_ns_map = {} + for (i, tok_index) in tok_ns_to_s_map.items(): + tok_s_to_ns_map[tok_index] = i + + orig_start_position = None + if start_position in tok_s_to_ns_map: + ns_start_position = tok_s_to_ns_map[start_position] + if ns_start_position in orig_ns_to_s_map: + orig_start_position = orig_ns_to_s_map[ns_start_position] + + if orig_start_position is None: + if verbose_logging: + logger.info("Couldn't map start position") + return orig_text + + orig_end_position = None + if end_position in tok_s_to_ns_map: + ns_end_position = tok_s_to_ns_map[end_position] + if ns_end_position in orig_ns_to_s_map: + orig_end_position = orig_ns_to_s_map[ns_end_position] + + if orig_end_position is None: + if verbose_logging: + logger.info("Couldn't map end position") + return orig_text + + output_text = orig_text[orig_start_position:(orig_end_position + 1)] + return output_text + + +def _get_best_indices(logits, n_best_size): + """Get the n-best logits from a list.""" + index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) + + best_indices = [] + for i in range(len(index_and_score)): + if i >= n_best_size: + break + best_indices.append(index_and_score[i][0]) + return best_indices + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs diff --git a/modelzoo/ELECTRA/tokenization.py b/modelzoo/ELECTRA/tokenization.py new file mode 100644 index 00000000..47421c2d --- /dev/null +++ b/modelzoo/ELECTRA/tokenization.py @@ -0,0 +1,68 @@ +# Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team. +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
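A small self-contained sketch (not taken from the patch; the numbers are made up) of how the SQuAD post-processing helpers above combine: `_get_best_indices` keeps the top-`n_best_size` logit positions, and `_compute_softmax` turns the summed start/end logits of the surviving candidate spans into probabilities:

    import math

    def softmax(scores):
        # same max-subtraction trick as _compute_softmax above
        m = max(scores)
        exps = [math.exp(s - m) for s in scores]
        total = sum(exps)
        return [e / total for e in exps]

    start_logits = [0.1, 2.3, -1.0, 4.2]
    top2 = sorted(range(len(start_logits)), key=lambda i: start_logits[i], reverse=True)[:2]
    # top2 == [3, 1], the positions _get_best_indices would keep for n_best_size=2

    candidate_scores = [7.1, 5.4, -0.3]   # hypothetical start_logit + end_logit sums
    print(softmax(candidate_scores))      # ~[0.85, 0.15, 0.0005]; the best span gets the mass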
+
+from tokenization_utils import BertTokenizer
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "google/electra-small-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-generator/vocab.txt",
+        "google/electra-base-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-generator/vocab.txt",
+        "google/electra-large-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-generator/vocab.txt",
+        "google/electra-small-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-discriminator/vocab.txt",
+        "google/electra-base-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-discriminator/vocab.txt",
+        "google/electra-large-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-discriminator/vocab.txt",
+    }
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "google/electra-small-generator": 512,
+    "google/electra-base-generator": 512,
+    "google/electra-large-generator": 512,
+    "google/electra-small-discriminator": 512,
+    "google/electra-base-discriminator": 512,
+    "google/electra-large-discriminator": 512,
+}
+
+
+PRETRAINED_INIT_CONFIGURATION = {
+    "google/electra-small-generator": {"do_lower_case": True},
+    "google/electra-base-generator": {"do_lower_case": True},
+    "google/electra-large-generator": {"do_lower_case": True},
+    "google/electra-small-discriminator": {"do_lower_case": True},
+    "google/electra-base-discriminator": {"do_lower_case": True},
+    "google/electra-large-discriminator": {"do_lower_case": True},
+}
+
+
+class ElectraTokenizer(BertTokenizer):
+    r"""
+    Constructs an Electra tokenizer.
+    :class:`~transformers.ElectraTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
+    tokenization: punctuation splitting + wordpiece.
+
+    Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+    parameters.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+
+
diff --git a/modelzoo/ELECTRA/tokenization_utils.py b/modelzoo/ELECTRA/tokenization_utils.py
new file mode 100644
index 00000000..928532c6
--- /dev/null
+++ b/modelzoo/ELECTRA/tokenization_utils.py
@@ -0,0 +1,2415 @@
+# coding=utf-8
+# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
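A minimal usage sketch for the `ElectraTokenizer` defined in modelzoo/ELECTRA/tokenization.py above (not part of the patch; it assumes the working directory is `modelzoo/ELECTRA` so that `tokenization.py` is importable, and that the shortcut name can be resolved to its `vocab.txt`, or that a local directory containing `vocab.txt` is passed instead):

    from tokenization import ElectraTokenizer

    # Resolves the shortcut name via PRETRAINED_VOCAB_FILES_MAP, caches vocab.txt,
    # and applies the do_lower_case=True default from PRETRAINED_INIT_CONFIGURATION.
    tokenizer = ElectraTokenizer.from_pretrained("google/electra-base-discriminator")

    tokens = tokenizer.tokenize("Deepray fine-tunes ELECTRA on SQuAD.")
    input_ids = tokenizer.encode("Deepray fine-tunes ELECTRA on SQuAD.", add_special_tokens=True)

The same `from_pretrained()` call also accepts a path to a directory previously written by `save_pretrained()`.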
+"""Tokenization classes for OpenAI GPT.""" + +import copy +import functools +import itertools +import json +import logging +import operator +import os +import re +import collections +import unicodedata + +from collections import UserDict, defaultdict +from contextlib import contextmanager +from typing import List, Optional, Sequence, Tuple, Union + +from tokenizers import AddedToken, Encoding +from tokenizers.implementations import BaseTokenizer + +from file_utils import cached_path, hf_bucket_url, is_remote_url, is_tf_available, is_torch_available + + +if is_tf_available(): + import tensorflow as tf +if is_torch_available(): + import torch + +logger = logging.getLogger(__name__) + +SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" +ADDED_TOKENS_FILE = "added_tokens.json" +TOKENIZER_CONFIG_FILE = "tokenizer_config.json" + + +# Define type aliases +TextInput = str +TextPairInput = Tuple[str, str] +PreTokenizedInput = List[str] +PreTokenizedInputPair = Tuple[List[str], List[str]] + + +def flatten(x: Sequence): + """ + Flatten the provided (potentially nested) sequence + + Args: + x (Sequence): Potentially nested sequence to flatten + + Returns: + list: Flattened sequence + """ + + return functools.reduce(operator.iconcat, x, []) + + +@contextmanager +def truncate_and_pad( + tokenizer: BaseTokenizer, + max_length: int, + stride: int, + strategy: str, + pad_to_max_length: bool, + padding_side: str, + pad_token_id: int, + pad_token_type_id: int, + pad_token: str, +): + """ + This contextmanager is in charge of defining the truncation and the padding strategies and then + restore the tokenizer settings afterwards. + + This contextmanager assumes the provider tokenizer has no padding / truncation strategy + before the managed section. If your tokenizer set a padding / truncation strategy before, + then it will be reset to no padding/truncation when exiting the managed section. 
+ + Args: + tokenizer (BaseTokenizer): The tokenizer which will be used + max_length (int): The maximum size of the sequence + stride (int): The stride to use when handling overflow + strategy (str): Overflowing logic to use + pad_to_max_length (bool): Boolean indicating if the output needs to be padded up to max_length + padding_side (str): "left" or "right" indicating the direction the output sequence will be padded + pad_token_id (int): The integer representation of the padding token to use + pad_token_type_id (int): The integer representation of the padding token type to use + pad_token (str): The string representation of the padding token to use + + Returns: + + """ + + # Handle all the truncation and padding stuff + if max_length is not None: + tokenizer.enable_truncation(max_length, stride=stride, strategy=strategy) + + if pad_to_max_length and (pad_token and pad_token_id >= 0): + tokenizer.enable_padding( + max_length=max_length, + direction=padding_side, + pad_id=pad_token_id, + pad_type_id=pad_token_type_id, + pad_token=pad_token, + ) + elif pad_to_max_length: + logger.warning( + "Disabled padding because no padding token set (pad_token: {}, pad_token_id: {}).\n" + "To remove this error, you can add a new pad token and then resize model embedding:\n" + "\ttokenizer.pad_token = ''\n\tmodel.resize_token_embeddings(len(tokenizer))".format( + pad_token, pad_token_id + ) + ) + + yield + + if max_length is not None: + tokenizer.no_truncation() + + if pad_to_max_length and (pad_token and pad_token_id >= 0): + tokenizer.no_padding() + + +class BatchEncoding(UserDict): + """ + Data structure derived from Dictionary holding all the required information to forward through + a model. + + In addition, this structure expose utility methods to map from word/char space to token space. + """ + + def __init__(self, data: dict, encoding: Optional[Union[Encoding, Sequence[Encoding]]] = None): + super().__init__(data) + + if isinstance(encoding, Encoding): + encoding = [encoding] + + self._encodings = encoding + + def __getitem__(self, item: Union[int, str]) -> Encoding: + if isinstance(item, str): + return self.data[item] + elif self._encodings is not None: + return self._encodings[item] + else: + raise KeyError("int index is supported only on {} from a Rust tokenizer".format(type(self).__name__)) + + def __getattr__(self, item: str): + return self.data[item] + + @property + def encodings(self) -> Optional[List[Encoding]]: + """ + Return the list all encoding from the tokenization process + + Returns: List[Encoding] or None if input was tokenized through Python tokenizer + """ + return self._encodings + + def keys(self): + return self.data.keys() + + def values(self): + return self.data.values() + + def items(self): + return self.data.items() + + def char_to_token_offsets(self, sentence: int, char: int) -> Tuple[int, int]: + """ + Find the Offsets of the token containing the character at the specified position + + Args: + sentence: Index of the sentence relative to the batch provided to the tokenizer + char: Char index to get the relative token offsets + + Returns: + tuple: (token start, token end) + + """ + + if not self._encodings: + raise ValueError("char_to_token_offsets() is not available when using Python based tokenizers") + return self[sentence].char_to_token_offsets(char) + + def char_to_token(self, sentence: int, char: int) -> int: + """ + Return the index of the token at position of the given char. 
+ + Args: + sentence (int): Index of the sentence relative to the batch provided to the tokenizer + char (int): Char index to get the relative token offsets + + Returns: + int: Integer referring to the position of the token in the returned set of tokens for the sentence + """ + + if not self._encodings: + raise ValueError("char_to_token() is not available when using Python based tokenizers") + return self[sentence].char_to_token(char) + + def char_to_word_offsets(self, sentence: int, char: int) -> Tuple[int, int]: + """ + Find the Offsets of the word containing the character at the specified position + + Args: + sentence (int): Index of the sentence relative to the batch provided to the tokenizer + char (int): Char index to get the relative token offsets + + Returns: + tuple: (word start, word end) representing the first and last characters of the word + """ + + if not self._encodings: + raise ValueError("char_to_word_offsets() is not available when using Python based tokenizers") + return self[sentence].char_to_word_offsets(char) + + def token_to_word_offsets(self, sentence: int, index: int) -> Optional[Tuple[int, int]]: + """ + Find the Offsets of the word containing the token at the given index + + Args: + sentence (int): Index of the sentence relative to the batch provided to the tokenizer + index (int): Index of the token to map to the original word offsets + + Returns: + Optional[tuple]: (word start, word end) or None + """ + + if not self._encodings: + raise ValueError("token_to_word_offsets() is not available when using Python based tokenizers") + return self[sentence].token_to_word_offsets(index) + + +class SpecialTokensMixin: + SPECIAL_TOKENS_ATTRIBUTES = [ + "bos_token", + "eos_token", + "unk_token", + "sep_token", + "pad_token", + "cls_token", + "mask_token", + "additional_special_tokens", + ] + + def __init__(self, **kwargs): + + self._bos_token = None + self._eos_token = None + self._unk_token = None + self._sep_token = None + self._pad_token = None + self._cls_token = None + self._mask_token = None + self._pad_token_type_id = 0 + self._additional_special_tokens = [] + + for key, value in kwargs.items(): + if key in self.SPECIAL_TOKENS_ATTRIBUTES: + if key == "additional_special_tokens": + assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value) + elif isinstance(value, AddedToken): + setattr(self, key, str(value)) + elif isinstance(value, str): + setattr(self, key, value) + else: + raise TypeError( + "special token {} has to be either str or AddedToken but got: {}".format(key, type(value)) + ) + + @property + def bos_token(self): + """ Beginning of sentence token (string). Log an error if used while not having been set. """ + if self._bos_token is None: + logger.error("Using bos_token, but it is not set yet.") + return self._bos_token + + @property + def eos_token(self): + """ End of sentence token (string). Log an error if used while not having been set. """ + if self._eos_token is None: + logger.error("Using eos_token, but it is not set yet.") + return self._eos_token + + @property + def unk_token(self): + """ Unknown token (string). Log an error if used while not having been set. """ + if self._unk_token is None: + logger.error("Using unk_token, but it is not set yet.") + return self._unk_token + + @property + def sep_token(self): + """ Separation token (string). E.g. separate context and query in an input sequence. Log an error if used while not having been set. 
""" + if self._sep_token is None: + logger.error("Using sep_token, but it is not set yet.") + return self._sep_token + + @property + def pad_token(self): + """ Padding token (string). Log an error if used while not having been set. """ + if self._pad_token is None: + logger.error("Using pad_token, but it is not set yet.") + return self._pad_token + + @property + def cls_token(self): + """ Classification token (string). E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ + if self._cls_token is None: + logger.error("Using cls_token, but it is not set yet.") + return self._cls_token + + @property + def mask_token(self): + """ Mask token (string). E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """ + if self._mask_token is None: + logger.error("Using mask_token, but it is not set yet.") + return self._mask_token + + @property + def additional_special_tokens(self): + """ All the additional special tokens you may want to use (list of strings). Log an error if used while not having been set. """ + if self._additional_special_tokens is None: + logger.error("Using additional_special_tokens, but it is not set yet.") + return self._additional_special_tokens + + @bos_token.setter + def bos_token(self, value): + self._bos_token = value + + @eos_token.setter + def eos_token(self, value): + self._eos_token = value + + @unk_token.setter + def unk_token(self, value): + self._unk_token = value + + @sep_token.setter + def sep_token(self, value): + self._sep_token = value + + @pad_token.setter + def pad_token(self, value): + self._pad_token = value + + @cls_token.setter + def cls_token(self, value): + self._cls_token = value + + @mask_token.setter + def mask_token(self, value): + self._mask_token = value + + @property + def bos_token_id(self): + """ Id of the beginning of sentence token in the vocabulary. Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.bos_token) + + @property + def eos_token_id(self): + """ Id of the end of sentence token in the vocabulary. Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.eos_token) + + @property + def unk_token_id(self): + """ Id of the unknown token in the vocabulary. Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.unk_token) + + @property + def sep_token_id(self): + """ Id of the separation token in the vocabulary. E.g. separate context and query in an input sequence. Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.sep_token) + + @property + def pad_token_id(self): + """ Id of the padding token in the vocabulary. Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.pad_token) + + @property + def pad_token_type_id(self): + """ Id of the padding token type in the vocabulary.""" + return self._pad_token_type_id + + @property + def cls_token_id(self): + """ Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.cls_token) + + @property + def mask_token_id(self): + """ Id of the mask token in the vocabulary. E.g. when training a model with masked-language modeling. 
Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.mask_token) + + @property + def additional_special_tokens_ids(self): + """ Ids of all the additional special tokens in the vocabulary (list of integers). Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.additional_special_tokens) + + @property + def special_tokens_map(self): + """ A dictionary mapping special token class attribute (cls_token, unk_token...) to their + values ('', ''...) + """ + set_attr = {} + for attr in self.SPECIAL_TOKENS_ATTRIBUTES: + attr_value = getattr(self, "_" + attr) + if attr_value: + set_attr[attr] = attr_value + return set_attr + + @property + def all_special_tokens(self): + """ List all the special tokens ('', ''...) mapped to class attributes + (cls_token, unk_token...). + """ + all_toks = [] + set_attr = self.special_tokens_map + for attr_value in set_attr.values(): + all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value]) + all_toks = list(set(all_toks)) + return all_toks + + @property + def all_special_ids(self): + """ List the vocabulary indices of the special tokens ('', ''...) mapped to + class attributes (cls_token, unk_token...). + """ + all_toks = self.all_special_tokens + all_ids = self.convert_tokens_to_ids(all_toks) + return all_ids + + @additional_special_tokens.setter + def additional_special_tokens(self, value): + self._additional_special_tokens = value + + +class PreTrainedTokenizer(SpecialTokensMixin): + """ Base class for all tokenizers. + Handle all the shared methods for tokenization and special tokens as well as methods downloading/caching/loading pretrained tokenizers as well as adding tokens to the vocabulary. + + This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...). + + Class attributes (overridden by derived classes): + + - ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file required by the model, and as associated values, the filename for saving the associated file (string). + - ``pretrained_vocab_files_map``: a python ``dict of dict`` the high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the associated pretrained vocabulary file. + - ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, or None if the model has no maximum input size. + - ``pretrained_init_configuration``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, a dictionnary of specific arguments to pass to the ``__init__``method of the tokenizer class for this pretrained model when loading the tokenizer with the ``from_pretrained()`` method. + + Parameters: + + - ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token`` and ``self.bos_token_id`` + + - ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token`` and ``self.eos_token_id`` + + - ``unk_token``: (`Optional`) string: an unknown token. 
Will be associated to ``self.unk_token`` and ``self.unk_token_id`` + + - ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token`` and ``self.sep_token_id`` + + - ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token`` and ``self.pad_token_id`` + + - ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token`` and ``self.cls_token_id`` + + - ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token`` and ``self.mask_token_id`` + + - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensure they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids`` + """ + + vocab_files_names = {} + pretrained_vocab_files_map = {} + pretrained_init_configuration = {} + max_model_input_sizes = {} + model_input_names = ["token_type_ids", "attention_mask"] + + padding_side = "right" + + NO_PAD_TOKEN_FOR_BATCH_MSG = ( + "No padding token is set for this model, therefore no batch can be made with uneven " + "sequences. Set a padding token or adjust the lengths of the sequences building the " + "batch so that every sequence is of the same length." + ) + + UNEVEN_SEQUENCES_FOR_BATCH_MSG = ( + "The sequences building the batch are not of the same size, no tensor " + "can be built. Set `pad_to_max_length=True` to pad the smaller sequences" + "up to the larger sequence's length." + ) + + @property + def vocab_size(self) -> int: + """ Size of the base vocabulary (without the added tokens) """ + raise NotImplementedError + + @property + def is_fast(self): + return False + + def get_vocab(self): + """ Returns the vocabulary as a dict of {token: index} pairs. `tokenizer.get_vocab()[token]` is equivalent to `tokenizer.convert_tokens_to_ids(token)` when `token` is in the vocab. """ + raise NotImplementedError() + + def __init__(self, max_len=None, **kwargs): + + super().__init__(**kwargs) + + self.max_len = max_len if max_len is not None else int(1e12) + + # Padding side is right by default and over-riden in subclasses. If specified in the kwargs, it is changed. + self.padding_side = kwargs.pop("padding_side", self.padding_side) + self.model_input_names = kwargs.pop("model_input_names", self.model_input_names) + + # Added tokens + self.added_tokens_encoder = {} + self.unique_added_tokens_encoder = set() + self.added_tokens_decoder = {} + + # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``) + self.init_inputs = () + self.init_kwargs = {} + + def __len__(self): + """ Size of the full vocabulary with the added tokens """ + return self.vocab_size + len(self.added_tokens_encoder) + + @classmethod + def from_pretrained(cls, *inputs, **kwargs): + r""" + Instantiate a :class:`~transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer. + + Args: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. 
+ - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. + - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. + - (not applicable to all derived classes, deprecated) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the vocabulary files and override the cached versions if they exists. + + resume_download: (`optional`) boolean, default False: + Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method. + + kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~transformers.PreTrainedTokenizer` for details. + + Examples:: + + # We can't instantiate directly the base class `PreTrainedTokenizer` so let's show our examples on a derived class: BertTokenizer + + # Download vocabulary from S3 and cache. + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + + # Download vocabulary from S3 (user-uploaded) and cache. + tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased') + + # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`) + tokenizer = BertTokenizer.from_pretrained('./test/saved_model/') + + # If the tokenizer uses a single vocabulary file, you can point directly to this file + tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt') + + # You can link tokens to special vocabulary when instantiating + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='') + # You should be sure '' is in the vocabulary when doing that. 
+ # Otherwise use tokenizer.add_special_tokens({'unk_token': ''}) instead) + assert tokenizer.unk_token == '' + + """ + return cls._from_pretrained(*inputs, **kwargs) + + @classmethod + def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs): + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", False) + + s3_models = list(cls.max_model_input_sizes.keys()) + vocab_files = {} + init_configuration = {} + if pretrained_model_name_or_path in s3_models: + # Get the vocabulary from AWS S3 bucket + for file_id, map_list in cls.pretrained_vocab_files_map.items(): + vocab_files[file_id] = map_list[pretrained_model_name_or_path] + if ( + cls.pretrained_init_configuration + and pretrained_model_name_or_path in cls.pretrained_init_configuration + ): + init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path].copy() + else: + # Get the vocabulary from local files + logger.info( + "Model name '{}' not found in model shortcut name list ({}). " + "Assuming '{}' is a path, a model identifier, or url to a directory containing tokenizer files.".format( + pretrained_model_name_or_path, ", ".join(s3_models), pretrained_model_name_or_path + ) + ) + + if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): + if len(cls.vocab_files_names) > 1: + raise ValueError( + "Calling {}.from_pretrained() with the path to a single file or url is not supported." + "Use a model identifier or the path to a directory instead.".format(cls.__name__) + ) + logger.warning( + "Calling {}.from_pretrained() with the path to a single file or url is deprecated".format( + cls.__name__ + ) + ) + file_id = list(cls.vocab_files_names.keys())[0] + vocab_files[file_id] = pretrained_model_name_or_path + else: + # At this point pretrained_model_name_or_path is either a directory or a model identifier name + additional_files_names = { + "added_tokens_file": ADDED_TOKENS_FILE, + "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, + "tokenizer_config_file": TOKENIZER_CONFIG_FILE, + } + # Look for the tokenizer main vocabulary files + the additional tokens files + for file_id, file_name in {**cls.vocab_files_names, **additional_files_names}.items(): + if os.path.isdir(pretrained_model_name_or_path): + full_file_name = os.path.join(pretrained_model_name_or_path, file_name) + if not os.path.exists(full_file_name): + logger.info("Didn't find file {}. We won't load it.".format(full_file_name)) + full_file_name = None + else: + full_file_name = hf_bucket_url(pretrained_model_name_or_path, postfix=file_name) + + vocab_files[file_id] = full_file_name + + # Get files from url, cache, or disk depending on the case + try: + resolved_vocab_files = {} + for file_id, file_path in vocab_files.items(): + if file_path is None: + resolved_vocab_files[file_id] = None + else: + resolved_vocab_files[file_id] = cached_path( + file_path, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + ) + except EnvironmentError: + if pretrained_model_name_or_path in s3_models: + msg = "Couldn't reach server at '{}' to download vocabulary files." + else: + msg = ( + "Model name '{}' was not found in tokenizers model name list ({}). 
" + "We assumed '{}' was a path or url to a directory containing vocabulary files " + "named {}, but couldn't find such vocabulary files at this path or url.".format( + pretrained_model_name_or_path, + ", ".join(s3_models), + pretrained_model_name_or_path, + list(cls.vocab_files_names.values()), + ) + ) + + raise EnvironmentError(msg) + + if all(full_file_name is None for full_file_name in resolved_vocab_files.values()): + raise EnvironmentError( + "Model name '{}' was not found in tokenizers model name list ({}). " + "We assumed '{}' was a path, a model identifier, or url to a directory containing vocabulary files " + "named {} but couldn't find such vocabulary files at this path or url.".format( + pretrained_model_name_or_path, + ", ".join(s3_models), + pretrained_model_name_or_path, + list(cls.vocab_files_names.values()), + ) + ) + + for file_id, file_path in vocab_files.items(): + if file_path == resolved_vocab_files[file_id]: + logger.info("loading file {}".format(file_path)) + else: + logger.info("loading file {} from cache at {}".format(file_path, resolved_vocab_files[file_id])) + + # Prepare tokenizer initialization kwargs + # Did we saved some inputs and kwargs to reload ? + tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None) + if tokenizer_config_file is not None: + with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle: + init_kwargs = json.load(tokenizer_config_handle) + saved_init_inputs = init_kwargs.pop("init_inputs", ()) + if not init_inputs: + init_inputs = saved_init_inputs + else: + init_kwargs = init_configuration + + # Update with newly provided kwargs + init_kwargs.update(kwargs) + + # Set max length if needed + if pretrained_model_name_or_path in cls.max_model_input_sizes: + # if we're using a pretrained model, ensure the tokenizer + # wont index sequences longer than the number of positional embeddings + max_len = cls.max_model_input_sizes[pretrained_model_name_or_path] + if max_len is not None and isinstance(max_len, (int, float)): + init_kwargs["max_len"] = min(init_kwargs.get("max_len", int(1e12)), max_len) + + # Merge resolved_vocab_files arguments in init_kwargs. + added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None) + special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None) + for args_name, file_path in resolved_vocab_files.items(): + if args_name not in init_kwargs: + init_kwargs[args_name] = file_path + if special_tokens_map_file is not None: + with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle: + special_tokens_map = json.load(special_tokens_map_handle) + for key, value in special_tokens_map.items(): + if key not in init_kwargs: + init_kwargs[key] = value + + # Instantiate tokenizer. + try: + tokenizer = cls(*init_inputs, **init_kwargs) + except OSError: + raise OSError( + "Unable to load vocabulary from file. " + "Please check that the provided vocabulary is accessible and not corrupted." + ) + + # Save inputs and kwargs for saving and re-loading with ``save_pretrained`` + tokenizer.init_inputs = init_inputs + tokenizer.init_kwargs = init_kwargs + + # update unique_added_tokens_encoder with special tokens for correct tokenization + tokenizer.unique_added_tokens_encoder.update(set(tokenizer.all_special_tokens)) + + # Add supplementary tokens. 
+ if added_tokens_file is not None: + with open(added_tokens_file, encoding="utf-8") as added_tokens_handle: + added_tok_encoder = json.load(added_tokens_handle) + added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} + tokenizer.added_tokens_encoder.update(added_tok_encoder) + tokenizer.added_tokens_decoder.update(added_tok_decoder) + tokenizer.unique_added_tokens_encoder.update(set(tokenizer.added_tokens_encoder.keys())) + + return tokenizer + + def save_pretrained(self, save_directory): + """ Save the tokenizer vocabulary files together with: + - added tokens, + - special-tokens-to-class-attributes-mapping, + - tokenizer instantiation positional and keywords inputs (e.g. do_lower_case for Bert). + + This won't save modifications other than (added tokens and special token mapping) you may have + applied to the tokenizer after the instantiation (e.g. modifying tokenizer.do_lower_case after creation). + + This method make sure the full tokenizer can then be re-loaded using the :func:`~transformers.PreTrainedTokenizer.from_pretrained` class method. + """ + if not os.path.isdir(save_directory): + logger.error("Saving directory ({}) should be a directory".format(save_directory)) + return + + special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE) + added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE) + tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE) + + tokenizer_config = copy.deepcopy(self.init_kwargs) + if len(self.init_inputs) > 0: + tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs) + for file_id in self.vocab_files_names.keys(): + tokenizer_config.pop(file_id, None) + + with open(tokenizer_config_file, "w", encoding="utf-8") as f: + f.write(json.dumps(tokenizer_config, ensure_ascii=False)) + + with open(special_tokens_map_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.special_tokens_map, ensure_ascii=False)) + + if len(self.added_tokens_encoder) > 0: + with open(added_tokens_file, "w", encoding="utf-8") as f: + out_str = json.dumps(self.added_tokens_encoder, ensure_ascii=False) + f.write(out_str) + + vocab_files = self.save_vocabulary(save_directory) + + return vocab_files + (special_tokens_map_file, added_tokens_file) + + def save_vocabulary(self, save_directory): + """ Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens + and special token mappings. + + Please use :func:`~transformers.PreTrainedTokenizer.save_pretrained` `()` to save the full Tokenizer state if you want to reload it using the :func:`~transformers.PreTrainedTokenizer.from_pretrained` class method. + """ + raise NotImplementedError + + def add_tokens(self, new_tokens): + """ + Add a list of new tokens to the tokenizer class. If the new tokens are not in the + vocabulary, they are added to it with indices starting from length of the current vocabulary. + + Args: + new_tokens: string or list of string. Each string is a token to add. Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them). + + Returns: + Number of tokens added to the vocabulary. 
+ + Examples:: + + # Let's see how to increase the vocabulary of Bert model and tokenizer + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertModel.from_pretrained('bert-base-uncased') + + num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2']) + print('We have added', num_added_toks, 'tokens') + model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. + """ + if not new_tokens: + return 0 + + if not isinstance(new_tokens, list): + new_tokens = [new_tokens] + + to_add_tokens = [] + for token in new_tokens: + assert isinstance(token, str) + if self.init_kwargs.get("do_lower_case", False) and token not in self.all_special_tokens: + token = token.lower() + if ( + token != self.unk_token + and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) + and token not in to_add_tokens + ): + to_add_tokens.append(token) + logger.info("Adding %s to the vocabulary", token) + + added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(to_add_tokens)) + added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} + self.added_tokens_encoder.update(added_tok_encoder) + self.unique_added_tokens_encoder = set(self.added_tokens_encoder.keys()).union(set(self.all_special_tokens)) + self.added_tokens_decoder.update(added_tok_decoder) + + return len(to_add_tokens) + + def num_special_tokens_to_add(self, pair=False): + """ + Returns the number of added tokens when encoding a sequence with special tokens. + + Note: + This encodes inputs and checks the number of added tokens, and is therefore not efficient. Do not put this + inside your training loop. + + Args: + pair: Returns the number of added tokens in the case of a sequence pair if set to True, returns the + number of added tokens in the case of a single sequence if set to False. + + Returns: + Number of tokens added to sequences + """ + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def add_special_tokens(self, special_tokens_dict): + """ + Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them + to class attributes. If special tokens are NOT in the vocabulary, they are added + to it (indexed starting from the last index of the current vocabulary). + + Using `add_special_tokens` will ensure your special tokens can be used in several ways: + + - special tokens are carefully handled by the tokenizer (they are never split) + - you can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This makes it easy to develop model-agnostic training and fine-tuning scripts. + + When possible, special tokens are already registered for provided pretrained models (ex: BertTokenizer cls_token is already registered to be '[CLS]' and XLM's one is also registered to be '') + + Args: + special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes: + [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, + ``additional_special_tokens``]. + + Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them). + + Returns: + Number of tokens added to the vocabulary. 
+ + Examples:: + + # Let's see how to add a new classification token to GPT-2 + tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + model = GPT2Model.from_pretrained('gpt2') + + special_tokens_dict = {'cls_token': ''} + + num_added_toks = tokenizer.add_special_tokens(special_tokens_dict) + print('We have added', num_added_toks, 'tokens') + model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. + + assert tokenizer.cls_token == '' + """ + if not special_tokens_dict: + return 0 + + added_tokens = 0 + for key, value in special_tokens_dict.items(): + assert key in self.SPECIAL_TOKENS_ATTRIBUTES + if key == "additional_special_tokens": + assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value) + added_tokens += self.add_tokens(value) + else: + assert isinstance(value, str) + added_tokens += self.add_tokens([value]) + logger.info("Assigning %s to the %s key of the tokenizer", value, key) + setattr(self, key, value) + + return added_tokens + + def tokenize(self, text: TextInput, **kwargs): + """ Converts a string in a sequence of tokens (string), using the tokenizer. + Split in words for word-based vocabulary or sub-words for sub-word-based + vocabularies (BPE/SentencePieces/WordPieces). + + Take care of added tokens. + + text: The sequence to be encoded. + add_prefix_space: Only applies to GPT-2 and RoBERTa tokenizers. When `True`, this ensures that the sequence + begins with an empty space. False by default except for when using RoBERTa with `add_special_tokens=True`. + **kwargs: passed to the `prepare_for_tokenization` preprocessing method. + """ + all_special_tokens = self.all_special_tokens + text = self.prepare_for_tokenization(text, **kwargs) + + def lowercase_text(t): + # convert non-special tokens to lowercase + escaped_special_toks = [re.escape(s_tok) for s_tok in all_special_tokens] + pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)" + return re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), t) + + if self.init_kwargs.get("do_lower_case", False): + text = lowercase_text(text) + + def split_on_token(tok, text): + result = [] + split_text = text.split(tok) + for i, sub_text in enumerate(split_text): + sub_text = sub_text.rstrip() + if i == 0 and not sub_text: + result += [tok] + elif i == len(split_text) - 1: + if sub_text: + result += [sub_text] + else: + pass + else: + if sub_text: + result += [sub_text] + result += [tok] + return result + + def split_on_tokens(tok_list, text): + if not text.strip(): + return [] + if not tok_list: + return self._tokenize(text) + + tokenized_text = [] + text_list = [text] + for tok in tok_list: + tokenized_text = [] + for sub_text in text_list: + if sub_text not in self.unique_added_tokens_encoder: + tokenized_text += split_on_token(tok, sub_text) + else: + tokenized_text += [sub_text] + text_list = tokenized_text + + return list( + itertools.chain.from_iterable( + ( + self._tokenize(token) if token not in self.unique_added_tokens_encoder else [token] + for token in tokenized_text + ) + ) + ) + + added_tokens = self.unique_added_tokens_encoder + tokenized_text = split_on_tokens(added_tokens, text) + return tokenized_text + + def _tokenize(self, text, **kwargs): + """ Converts a string in a sequence of tokens (string), using the tokenizer. + Split in words for word-based vocabulary or sub-words for sub-word-based + vocabularies (BPE/SentencePieces/WordPieces). 
+ + Do NOT take care of added tokens. + """ + raise NotImplementedError + + def convert_tokens_to_ids(self, tokens): + """ Converts a single token, or a sequence of tokens, (str) in a single integer id + (resp. a sequence of ids), using the vocabulary. + """ + if tokens is None: + return None + + if isinstance(tokens, str): + return self._convert_token_to_id_with_added_voc(tokens) + + ids = [] + for token in tokens: + ids.append(self._convert_token_to_id_with_added_voc(token)) + return ids + + def _convert_token_to_id_with_added_voc(self, token): + if token is None: + return None + + if token in self.added_tokens_encoder: + return self.added_tokens_encoder[token] + return self._convert_token_to_id(token) + + def _convert_token_to_id(self, token): + raise NotImplementedError + + def encode( + self, + text: TextInput, + text_pair: Optional[TextInput] = None, + add_special_tokens: bool = True, + max_length: Optional[int] = None, + stride: int = 0, + truncation_strategy: str = "longest_first", + pad_to_max_length: bool = False, + return_tensors: Optional[str] = None, + **kwargs + ): + """ + Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary. + + Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``. + + Args: + text (:obj:`str` or :obj:`List[str]`): + The first sequence to be encoded. This can be a string, a list of strings (tokenized string using + the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` + method) + text_pair (:obj:`str` or :obj:`List[str]`, `optional`, defaults to :obj:`None`): + Optional second sequence to be encoded. This can be a string, a list of strings (tokenized + string using the `tokenize` method) or a list of integers (tokenized string ids using the + `convert_tokens_to_ids` method) + add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`): + If set to ``True``, the sequences will be encoded with the special tokens relative + to their model. + max_length (:obj:`int`, `optional`, defaults to :obj:`None`): + If set to a number, will limit the total sequence returned so that it has a maximum length. + If there are overflowing tokens, those will be added to the returned dictionary + stride (:obj:`int`, `optional`, defaults to ``0``): + If set to a number along with max_length, the overflowing tokens returned will contain some tokens + from the main sequence returned. The value of this argument defines the number of additional tokens. + truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`): + String selected in the following options: + + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length + starting from the longest one at each token (when there is a pair of input sequences) + - 'only_first': Only truncate the first sequence + - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) + pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`): + If set to True, the returned sequences will be padded according to the model's padding side and + padding index, up to their max length. If no max length is specified, the padding is done up to the + model's max length. 
The tokenizer padding sides are handled by the class attribute `padding_side` + which can be set to the following strings: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + Defaults to False: no padding. + return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): + Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant` + or PyTorch :obj:`torch.Tensor` instead of a list of python integers. + **kwargs: passed to the `self.tokenize()` method + """ + encoded_inputs = self.encode_plus( + text, + text_pair=text_pair, + max_length=max_length, + add_special_tokens=add_special_tokens, + stride=stride, + truncation_strategy=truncation_strategy, + pad_to_max_length=pad_to_max_length, + return_tensors=return_tensors, + **kwargs, + ) + + return encoded_inputs["input_ids"] + + def encode_plus( + self, + text: TextInput, + text_pair: Optional[TextInput] = None, + add_special_tokens: bool = True, + max_length: Optional[int] = None, + stride: int = 0, + truncation_strategy: str = "longest_first", + pad_to_max_length: bool = False, + is_pretokenized: bool = False, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + **kwargs + ) -> BatchEncoding: + """ + Returns a dictionary containing the encoded sequence or sequence pair and additional information: + the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. + + Args: + text (:obj:`str` or :obj:`List[str]`): + The first sequence to be encoded. This can be a string, a list of strings (tokenized string using + the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` + method) + text_pair (:obj:`str` or :obj:`List[str]`, `optional`, defaults to :obj:`None`): + Optional second sequence to be encoded. This can be a string, a list of strings (tokenized + string using the `tokenize` method) or a list of integers (tokenized string ids using the + `convert_tokens_to_ids` method) + add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`): + If set to ``True``, the sequences will be encoded with the special tokens relative + to their model. + max_length (:obj:`int`, `optional`, defaults to :obj:`None`): + If set to a number, will limit the total sequence returned so that it has a maximum length. + If there are overflowing tokens, those will be added to the returned dictionary + stride (:obj:`int`, `optional`, defaults to ``0``): + If set to a number along with max_length, the overflowing tokens returned will contain some tokens + from the main sequence returned. The value of this argument defines the number of additional tokens. 
+ truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`): + String selected in the following options: + + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length + starting from the longest one at each token (when there is a pair of input sequences) + - 'only_first': Only truncate the first sequence + - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) + pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`): + If set to True, the returned sequences will be padded according to the model's padding side and + padding index, up to their max length. If no max length is specified, the padding is done up to the + model's max length. The tokenizer padding sides are handled by the class attribute `padding_side` + which can be set to the following strings: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + Defaults to False: no padding. + is_pretokenized (:obj:`bool`, defaults to :obj:`False`): + Set to True to indicate the input is already tokenized + return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): + Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant` + or PyTorch :obj:`torch.Tensor` instead of a list of python integers. + return_token_type_ids (:obj:`bool`, `optional`, defaults to :obj:`None`): + Whether to return token type IDs. If left to the default, will return the token type IDs according + to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + return_attention_mask (:obj:`bool`, `optional`, defaults to :obj:`none`): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. + + `What are attention masks? <../glossary.html#attention-mask>`__ + return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return overflowing token information (default False). + return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return special tokens mask information (default False). + return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return (char_start, char_end) for each token (default False). + If using Python's tokenizer, this method will raise NotImplementedError. This one is only available on + Rust-based tokenizers inheriting from PreTrainedTokenizerFast. 
+ **kwargs: passed to the `self.tokenize()` method + + Return: + A Dictionary of shape:: + + { + input_ids: list[int], + token_type_ids: list[int] if return_token_type_ids is True (default) + attention_mask: list[int] if return_attention_mask is True (default) + overflowing_tokens: list[int] if a ``max_length`` is specified and return_overflowing_tokens is True + num_truncated_tokens: int if a ``max_length`` is specified and return_overflowing_tokens is True + special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True`` and return_special_tokens_mask is True + } + + With the fields: + + - ``input_ids``: list of token ids to be fed to a model + - ``token_type_ids``: list of token type ids to be fed to a model + - ``attention_mask``: list of indices specifying which tokens should be attended to by the model + - ``overflowing_tokens``: list of overflowing tokens if a max length is specified. + - ``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified + - ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added + tokens and 1 specifying sequence tokens. + """ + + def get_input_ids(text): + if isinstance(text, str): + tokens = self.tokenize(text, add_special_tokens=add_special_tokens, **kwargs) + return self.convert_tokens_to_ids(tokens) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): + return self.convert_tokens_to_ids(text) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): + return text + else: + raise ValueError( + "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." + ) + + if return_offsets_mapping: + raise NotImplementedError( + "return_offset_mapping is not available when using Python tokenizers." + "To use this feature, change your tokenizer to one deriving from " + "transformers.PreTrainedTokenizerFast." + "More information on available tokenizers at " + "https://github.com/huggingface/transformers/pull/2674" + ) + + # Throw an error if we can pad because there is no padding token + if pad_to_max_length and self.pad_token_id is None: + raise ValueError( + "Unable to set proper padding strategy as the tokenizer does not have a padding token. 
In this case please set the `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via the function add_special_tokens if you want to use a padding strategy" + ) + + first_ids = get_input_ids(text) + second_ids = get_input_ids(text_pair) if text_pair is not None else None + + return self.prepare_for_model( + first_ids, + pair_ids=second_ids, + max_length=max_length, + pad_to_max_length=pad_to_max_length, + add_special_tokens=add_special_tokens, + stride=stride, + truncation_strategy=truncation_strategy, + return_tensors=return_tensors, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + ) + + def batch_encode_plus( + self, + batch_text_or_text_pairs: Union[ + List[TextInput], List[TextPairInput], List[PreTokenizedInput], List[PreTokenizedInputPair] + ], + add_special_tokens: bool = True, + max_length: Optional[int] = None, + stride: int = 0, + truncation_strategy: str = "longest_first", + pad_to_max_length: bool = False, + is_pretokenized: bool = False, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_masks: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_masks: bool = False, + return_offsets_mapping: bool = False, + return_input_lengths: bool = False, + **kwargs + ) -> BatchEncoding: + """ + Returns a dictionary containing the encoded sequence or sequence pair and additional information: + the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. + + Args: + batch_text_or_text_pairs (:obj:`List[str]` or :obj:`List[List[str]]`): + Batch of sequences or pair of sequences to be encoded. + This can be a list of string/string-sequences/int-sequences or a list of pair of + string/string-sequences/int-sequence (see details in encode_plus) + add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`): + If set to ``True``, the sequences will be encoded with the special tokens relative + to their model. + max_length (:obj:`int`, `optional`, defaults to :obj:`None`): + If set to a number, will limit the total sequence returned so that it has a maximum length. + If there are overflowing tokens, those will be added to the returned dictionary + stride (:obj:`int`, `optional`, defaults to ``0``): + If set to a number along with max_length, the overflowing tokens returned will contain some tokens + from the main sequence returned. The value of this argument defines the number of additional tokens. + truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`): + String selected in the following options: + + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length + starting from the longest one at each token (when there is a pair of input sequences) + - 'only_first': Only truncate the first sequence + - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) + pad_to_max_length (:obj:`bool`, `optional`, defaults to :obj:`False`): + If set to True, the returned sequences will be padded according to the model's padding side and + padding index, up to their max length. If no max length is specified, the padding is done up to the + model's max length. 
The tokenizer padding sides are handled by the class attribute `padding_side` + which can be set to the following strings: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + Defaults to False: no padding. + is_pretokenized (:obj:`bool`, defaults to :obj:`False`): + Set to True to indicate the input is already tokenized + return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): + Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant` + or PyTorch :obj:`torch.Tensor` instead of a list of python integers. + return_token_type_ids (:obj:`bool`, `optional`, defaults to :obj:`None`): + Whether to return token type IDs. If left to the default, will return the token type IDs according + to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + return_attention_masks (:obj:`bool`, `optional`, defaults to :obj:`none`): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. + + `What are attention masks? <../glossary.html#attention-mask>`__ + return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return overflowing token information (default False). + return_special_tokens_masks (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return special tokens mask information (default False). + return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True to return (char_start, char_end) for each token (default False). + If using Python's tokenizer, this method will raise NotImplementedError. This one is only available on + Rust-based tokenizers inheriting from PreTrainedTokenizerFast. + return_input_lengths (:obj:`bool`, `optional`, defaults to :obj:`False`): + If set the resulting dictionary will include the length of each sample + **kwargs: passed to the `self.tokenize()` method + + Return: + A Dictionary of shape:: + + { + input_ids: list[List[int]], + token_type_ids: list[List[int]] if return_token_type_ids is True (default) + attention_mask: list[List[int]] if return_attention_mask is True (default) + overflowing_tokens: list[List[int]] if a ``max_length`` is specified and return_overflowing_tokens is True + num_truncated_tokens: List[int] if a ``max_length`` is specified and return_overflowing_tokens is True + special_tokens_mask: list[List[int]] if ``add_special_tokens`` if set to ``True`` and return_special_tokens_mask is True + } + + With the fields: + + - ``input_ids``: list of token ids to be fed to a model + - ``token_type_ids``: list of token type ids to be fed to a model + - ``attention_mask``: list of indices specifying which tokens should be attended to by the model + - ``overflowing_tokens``: list of overflowing tokens if a max length is specified. + - ``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified + - ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added + tokens and 1 specifying sequence tokens. 
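[Editor's note] A minimal usage sketch of the batch API documented above, for readers skimming the patch. This is an illustration only, not part of the diff: it assumes the classes in this file are importable, the vocabulary path points at the vocab.txt added under modelzoo/ELECTRA/vocab/, and the example strings are placeholders; the exact fields returned depend on the tokenizer's `model_input_names`.

    tokenizer = BertTokenizer("modelzoo/ELECTRA/vocab/vocab.txt", do_lower_case=True)
    batch = tokenizer.batch_encode_plus(
        ["hello world", "hi"],       # two sequences of different lengths
        pad_to_max_length=True,      # pad the shorter row up to the longest one in the batch
    )
    # batch["input_ids"]      -> list of int lists, each row is [CLS] ... [SEP] plus [PAD] ids
    # batch["token_type_ids"] -> all zeros for single (unpaired) sequences
    # batch["attention_mask"] -> 1 for real tokens, 0 for padding positions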
+ """ + + def get_input_ids(text): + if isinstance(text, str): + tokens = self.tokenize(text, add_special_tokens=add_special_tokens, **kwargs) + return self.convert_tokens_to_ids(tokens) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): + return self.convert_tokens_to_ids(text) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): + return text + else: + raise ValueError( + "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." + ) + + # Throw an error if we can pad because there is no padding token + if pad_to_max_length and self.pad_token_id is None: + raise ValueError( + "Unable to set proper padding strategy as the tokenizer does not have a padding token. In this case please set the `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via the function add_special_tokens if you want to use a padding strategy" + ) + + if return_offsets_mapping: + raise NotImplementedError( + "return_offset_mapping is not available when using Python tokenizers." + "To use this feature, change your tokenizer to one deriving from " + "transformers.PreTrainedTokenizerFast." + "More information on available tokenizers at " + "https://github.com/huggingface/transformers/pull/2674" + ) + + input_ids = [] + for ids_or_pair_ids in batch_text_or_text_pairs: + if isinstance(ids_or_pair_ids, (list, tuple)) and len(ids_or_pair_ids) == 2 and not is_pretokenized: + ids, pair_ids = ids_or_pair_ids + else: + ids, pair_ids = ids_or_pair_ids, None + + first_ids = get_input_ids(ids) + second_ids = get_input_ids(pair_ids) if pair_ids is not None else None + input_ids.append((first_ids, second_ids)) + + if max_length is None and pad_to_max_length: + + def total_sequence_length(input_pairs): + first_ids, second_ids = input_pairs + return len(first_ids) + ( + self.num_special_tokens_to_add() + if second_ids is None + else (len(second_ids) + self.num_special_tokens_to_add(pair=True)) + ) + + max_length = max([total_sequence_length(ids) for ids in input_ids]) + + batch_outputs = {} + for first_ids, second_ids in input_ids: + # Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by + # the model. 
It adds special tokens, truncates sequences if overflowing while taking into account + # the special tokens and manages a window stride for overflowing tokens + outputs = self.prepare_for_model( + first_ids, + pair_ids=second_ids, + max_length=max_length, + pad_to_max_length=pad_to_max_length, + add_special_tokens=add_special_tokens, + stride=stride, + truncation_strategy=truncation_strategy, + return_attention_mask=return_attention_masks, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_masks, + ) + + # Append the non-padded length to the output + if return_input_lengths: + outputs["input_len"] = len(outputs["input_ids"]) + + for key, value in outputs.items(): + if key not in batch_outputs: + batch_outputs[key] = [] + batch_outputs[key].append(value) + + if return_tensors is not None: + + # Do the tensor conversion in batch + for key, value in batch_outputs.items(): + if return_tensors == "tf" and is_tf_available(): + try: + batch_outputs[key] = tf.constant(value) + except ValueError: + if None in [item for sequence in value for item in sequence]: + raise ValueError(self.NO_PAD_TOKEN_FOR_BATCH_MSG) + else: + raise ValueError(self.UNEVEN_SEQUENCES_FOR_BATCH_MSG) + elif return_tensors == "pt" and is_torch_available(): + try: + batch_outputs[key] = torch.tensor(value) + except ValueError: + raise ValueError(self.UNEVEN_SEQUENCES_FOR_BATCH_MSG) + except RuntimeError: + if None in [item for sequence in value for item in sequence]: + raise ValueError(self.NO_PAD_TOKEN_FOR_BATCH_MSG) + else: + raise + elif return_tensors is not None: + logger.warning( + "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format( + return_tensors + ) + ) + + return BatchEncoding(batch_outputs) + + def prepare_for_model( + self, + ids: List[int], + pair_ids: Optional[List[int]] = None, + max_length: Optional[int] = None, + add_special_tokens: bool = True, + stride: int = 0, + truncation_strategy: str = "longest_first", + pad_to_max_length: bool = False, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + ): + """ + Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. + It adds special tokens, truncates + sequences if overflowing while taking into account the special tokens and manages a window stride for + overflowing tokens + + Args: + ids: list of tokenized input ids. Can be obtained from a string by chaining the + `tokenize` and `convert_tokens_to_ids` methods. + pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the + `tokenize` and `convert_tokens_to_ids` methods. + max_length: maximum length of the returned list. Will truncate by taking into account the special tokens. + add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative + to their model. + stride: window stride for overflowing tokens. Can be useful for edge effect removal when using sequential + list of inputs. 
+        truncation_strategy: string selected in the following options:
+            - 'longest_first' (default): iteratively reduce the input sequences until the total length is under
+                max_length, removing one token at a time from the longest sequence (when there is a pair of input sequences)
+            - 'only_first': Only truncate the first sequence
+            - 'only_second': Only truncate the second sequence
+            - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
+        pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and
+            padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length.
+            The tokenizer padding sides are handled by the class attribute `padding_side`, which can be set to the following strings:
+            - 'left': pads on the left of the sequences
+            - 'right': pads on the right of the sequences
+            Defaults to False: no padding.
+        return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
+            or PyTorch torch.Tensor instead of a list of python integers.
+        return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
+        return_attention_mask: (optional) Set to False to avoid returning the attention mask (default True).
+        return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False).
+        return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False).
+
+        Return:
+            A Dictionary of shape::
+
+                {
+                    input_ids: list[int],
+                    token_type_ids: list[int] if return_token_type_ids is True (default)
+                    overflowing_tokens: list[int] if a ``max_length`` is specified and return_overflowing_tokens is True
+                    num_truncated_tokens: int if a ``max_length`` is specified and return_overflowing_tokens is True
+                    special_tokens_mask: list[int] if ``add_special_tokens`` is set to ``True`` and return_special_tokens_mask is True
+                }
+
+            With the fields:
+                ``input_ids``: list of token ids to be fed to a model
+                ``token_type_ids``: list of token type ids to be fed to a model
+                ``overflowing_tokens``: list of overflowing tokens if a max length is specified.
+                ``num_truncated_tokens``: number of overflowing tokens when a ``max_length`` is specified
+                ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added
+                    tokens and 1 specifying sequence tokens.
+ """ + pair = bool(pair_ids is not None) + len_ids = len(ids) + len_pair_ids = len(pair_ids) if pair else 0 + + if return_token_type_ids is None: + return_token_type_ids = "token_type_ids" in self.model_input_names + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + + encoded_inputs = {} + + # Handle max sequence length + total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0) + if max_length and total_len > max_length: + ids, pair_ids, overflowing_tokens = self.truncate_sequences( + ids, + pair_ids=pair_ids, + num_tokens_to_remove=total_len - max_length, + truncation_strategy=truncation_strategy, + stride=stride, + ) + if return_overflowing_tokens: + encoded_inputs["overflowing_tokens"] = overflowing_tokens + encoded_inputs["num_truncated_tokens"] = total_len - max_length + + # Handle special_tokens + if add_special_tokens: + sequence = self.build_inputs_with_special_tokens(ids, pair_ids) + token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) + else: + sequence = ids + pair_ids if pair else ids + token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else []) + + if return_special_tokens_mask: + if add_special_tokens: + encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids) + else: + encoded_inputs["special_tokens_mask"] = [0] * len(sequence) + + encoded_inputs["input_ids"] = sequence + if return_token_type_ids: + encoded_inputs["token_type_ids"] = token_type_ids + + if max_length and len(encoded_inputs["input_ids"]) > max_length: + encoded_inputs["input_ids"] = encoded_inputs["input_ids"][:max_length] + if return_token_type_ids: + encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length] + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length] + + if max_length is None and len(encoded_inputs["input_ids"]) > self.max_len: + logger.warning( + "Token indices sequence length is longer than the specified maximum sequence length " + "for this model ({} > {}). Running this sequence through the model will result in " + "indexing errors".format(len(ids), self.max_len) + ) + + needs_to_be_padded = pad_to_max_length and ( + max_length + and len(encoded_inputs["input_ids"]) < max_length + or max_length is None + and len(encoded_inputs["input_ids"]) < self.max_len + and self.max_len <= 10000 + ) + + if pad_to_max_length and max_length is None and self.max_len > 10000: + logger.warning( + "Sequence can't be padded as no maximum length is specified and the model maximum length is too high." 
+ ) + + if needs_to_be_padded: + difference = (max_length if max_length is not None else self.max_len) - len(encoded_inputs["input_ids"]) + + if self.padding_side == "right": + if return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference + if return_token_type_ids: + encoded_inputs["token_type_ids"] = ( + encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference + ) + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference + encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference + elif self.padding_side == "left": + if return_attention_mask: + encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"]) + if return_token_type_ids: + encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[ + "token_type_ids" + ] + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] + encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"] + + else: + raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + + elif return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + + # Prepare inputs as tensors if asked + if return_tensors == "tf" and is_tf_available(): + encoded_inputs["input_ids"] = tf.constant([encoded_inputs["input_ids"]]) + + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = tf.constant([encoded_inputs["token_type_ids"]]) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = tf.constant([encoded_inputs["attention_mask"]]) + + elif return_tensors == "pt" and is_torch_available(): + encoded_inputs["input_ids"] = torch.tensor([encoded_inputs["input_ids"]]) + + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = torch.tensor([encoded_inputs["token_type_ids"]]) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = torch.tensor([encoded_inputs["attention_mask"]]) + elif return_tensors is not None: + logger.warning( + "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format( + return_tensors + ) + ) + + return BatchEncoding(encoded_inputs) + + def prepare_for_tokenization(self, text, **kwargs): + """ Performs any necessary transformations before tokenization """ + return text + + def truncate_sequences( + self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy="longest_first", stride=0 + ): + """Truncates a sequence pair in place to the maximum length. + truncation_strategy: string selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length + starting from the longest one at each token (when there is a pair of input sequences). + Overflowing tokens only contains overflow from the first sequence. + - 'only_first': Only truncate the first sequence. raise an error if the first sequence is shorter or equal to than num_tokens_to_remove. 
+ - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) + """ + if num_tokens_to_remove <= 0: + return ids, pair_ids, [] + + if truncation_strategy == "longest_first": + overflowing_tokens = [] + for _ in range(num_tokens_to_remove): + if pair_ids is None or len(ids) > len(pair_ids): + overflowing_tokens = [ids[-1]] + overflowing_tokens + ids = ids[:-1] + else: + pair_ids = pair_ids[:-1] + window_len = min(len(ids), stride) + if window_len > 0: + overflowing_tokens = ids[-window_len:] + overflowing_tokens + elif truncation_strategy == "only_first": + assert len(ids) > num_tokens_to_remove + window_len = min(len(ids), stride + num_tokens_to_remove) + overflowing_tokens = ids[-window_len:] + ids = ids[:-num_tokens_to_remove] + elif truncation_strategy == "only_second": + assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove + window_len = min(len(pair_ids), stride + num_tokens_to_remove) + overflowing_tokens = pair_ids[-window_len:] + pair_ids = pair_ids[:-num_tokens_to_remove] + elif truncation_strategy == "do_not_truncate": + raise ValueError("Input sequence are too long for max_length. Please select a truncation strategy.") + else: + raise ValueError( + "Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']" + ) + return (ids, pair_ids, overflowing_tokens) + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + if token_ids_1 is None: + return len(token_ids_0) * [0] + return [0] * len(token_ids_0) + [1] * len(token_ids_1) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks + by concatenating and adding special tokens. + A RoBERTa sequence has the following format: + single sequence: X + pair of sequences: A B + """ + if token_ids_1 is None: + return token_ids_0 + return token_ids_0 + token_ids_1 + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + + Args: + token_ids_0: list of ids (must not contain special tokens) + token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids + for sequence pairs + already_has_special_tokens: (default False) Set to True if the token list is already formated with + special tokens for the model + + Returns: + A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0)) + + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): + """ Converts a single index or a sequence of indices (integers) in a token " + (resp.) a sequence of tokens (str), using the vocabulary and added tokens. + + Args: + skip_special_tokens: Don't decode special tokens (self.all_special_tokens). 
Default: False
+        """
+        if isinstance(ids, int):
+            if ids in self.added_tokens_decoder:
+                return self.added_tokens_decoder[ids]
+            else:
+                return self._convert_id_to_token(ids)
+        tokens = []
+        for index in ids:
+            index = int(index)
+            if skip_special_tokens and index in self.all_special_ids:
+                continue
+            if index in self.added_tokens_decoder:
+                tokens.append(self.added_tokens_decoder[index])
+            else:
+                tokens.append(self._convert_id_to_token(index))
+        return tokens
+
+    def _convert_id_to_token(self, index):
+        raise NotImplementedError
+
+    def convert_tokens_to_string(self, tokens):
+        """ Converts a sequence of tokens (string) into a single string.
+            The simplest way to do it is ' '.join(self.convert_ids_to_tokens(token_ids))
+            but we often want to remove sub-word tokenization artifacts at the same time.
+        """
+        return " ".join(self.convert_ids_to_tokens(tokens))
+
+    def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
+        """
+        Converts a sequence of ids (integer) into a string, using the tokenizer and vocabulary
+        with options to remove special tokens and clean up tokenization spaces.
+        Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
+
+        Args:
+            token_ids: list of tokenized input ids. Can be obtained using the `encode` or `encode_plus` methods.
+            skip_special_tokens: if set to True, will remove special tokens from the decoded string.
+            clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces.
+        """
+        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
+
+        # To avoid mixing byte-level and unicode for byte-level BPE
+        # we need to build the string separately for added tokens and byte-level tokens
+        # cf. https://github.com/huggingface/transformers/issues/1133
+        sub_texts = []
+        current_sub_text = []
+        for token in filtered_tokens:
+            # compare against the special token strings (ids were already mapped back to tokens above)
+            if skip_special_tokens and token in self.all_special_tokens:
+                continue
+            if token in self.added_tokens_encoder:
+                if current_sub_text:
+                    sub_texts.append(self.convert_tokens_to_string(current_sub_text))
+                    current_sub_text = []
+                sub_texts.append(token)
+            else:
+                current_sub_text.append(token)
+        if current_sub_text:
+            sub_texts.append(self.convert_tokens_to_string(current_sub_text))
+        text = " ".join(sub_texts)
+
+        if clean_up_tokenization_spaces:
+            clean_text = self.clean_up_tokenization(text)
+            return clean_text
+        else:
+            return text
+
+    @staticmethod
+    def clean_up_tokenization(out_string):
+        """ Clean up a list of simple English tokenization artifacts like spaces before punctuation and abbreviated forms.
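[Editor's note] A small editorial sketch, not part of the diff, of how `decode` and `clean_up_tokenization` interact; it assumes a `BertTokenizer` instance named `tok` whose vocabulary contains the words used.

    ids = tok.encode("hello, how are you?")   # [CLS] hello , how are you ? [SEP] as ids
    tok.decode(ids, skip_special_tokens=True)
    # Without cleanup the detokenized text would read "hello , how are you ?";
    # clean_up_tokenization re-attaches the punctuation: "hello, how are you?"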
+ """ + out_string = ( + out_string.replace(" .", ".") + .replace(" ?", "?") + .replace(" !", "!") + .replace(" ,", ",") + .replace(" ' ", "'") + .replace(" n't", "n't") + .replace(" 'm", "'m") + .replace(" do not", " don't") + .replace(" 's", "'s") + .replace(" 've", "'ve") + .replace(" 're", "'re") + ) + return out_string + + +def trim_batch( + input_ids, pad_token_id, attention_mask=None, +): + """Remove columns that are populated exclusively by pad_token_id""" + keep_column_mask = input_ids.ne(pad_token_id).any(dim=0) + if attention_mask is None: + return input_ids[:, keep_column_mask] + else: + return (input_ids[:, keep_column_mask], attention_mask[:, keep_column_mask]) + + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", + "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", + "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", + "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", + "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", + "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", + "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", + "bert-base-german-cased": "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt", + "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt", + "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt", + "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt", + "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt", + "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt", + "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt", + "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt", + "bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt", + "bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/vocab.txt", + "bert-base-dutch-cased": 
"https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/vocab.txt", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "bert-base-uncased": 512, + "bert-large-uncased": 512, + "bert-base-cased": 512, + "bert-large-cased": 512, + "bert-base-multilingual-uncased": 512, + "bert-base-multilingual-cased": 512, + "bert-base-chinese": 512, + "bert-base-german-cased": 512, + "bert-large-uncased-whole-word-masking": 512, + "bert-large-cased-whole-word-masking": 512, + "bert-large-uncased-whole-word-masking-finetuned-squad": 512, + "bert-large-cased-whole-word-masking-finetuned-squad": 512, + "bert-base-cased-finetuned-mrpc": 512, + "bert-base-german-dbmdz-cased": 512, + "bert-base-german-dbmdz-uncased": 512, + "bert-base-finnish-cased-v1": 512, + "bert-base-finnish-uncased-v1": 512, + "bert-base-dutch-cased": 512, +} + +PRETRAINED_INIT_CONFIGURATION = { + "bert-base-uncased": {"do_lower_case": True}, + "bert-large-uncased": {"do_lower_case": True}, + "bert-base-cased": {"do_lower_case": False}, + "bert-large-cased": {"do_lower_case": False}, + "bert-base-multilingual-uncased": {"do_lower_case": True}, + "bert-base-multilingual-cased": {"do_lower_case": False}, + "bert-base-chinese": {"do_lower_case": False}, + "bert-base-german-cased": {"do_lower_case": False}, + "bert-large-uncased-whole-word-masking": {"do_lower_case": True}, + "bert-large-cased-whole-word-masking": {"do_lower_case": False}, + "bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True}, + "bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False}, + "bert-base-cased-finetuned-mrpc": {"do_lower_case": False}, + "bert-base-german-dbmdz-cased": {"do_lower_case": False}, + "bert-base-german-dbmdz-uncased": {"do_lower_case": True}, + "bert-base-finnish-cased-v1": {"do_lower_case": False}, + "bert-base-finnish-uncased-v1": {"do_lower_case": True}, + "bert-base-dutch-cased": {"do_lower_case": False}, +} + + +# Bert Classes +class BertTokenizer(PreTrainedTokenizer): + r""" + Constructs a BERT tokenizer. Based on WordPiece. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users + should refer to the superclass for more information regarding methods. + + Args: + vocab_file (:obj:`string`): + File containing the vocabulary. + do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to lowercase the input when tokenizing. + do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to do basic tokenization before WordPiece. + never_split (:obj:`bool`, `optional`, defaults to :obj:`True`): + List of tokens which will never be split during tokenization. Only has an effect when + :obj:`do_basic_tokenize=True` + unk_token (:obj:`string`, `optional`, defaults to "[UNK]"): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (:obj:`string`, `optional`, defaults to "[SEP]"): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences + for sequence classification or for a text and a question for question answering. + It is also used as the last token of a sequence built with special tokens. + pad_token (:obj:`string`, `optional`, defaults to "[PAD]"): + The token used for padding, for example when batching sequences of different lengths. 
+ cls_token (:obj:`string`, `optional`, defaults to "[CLS]"): + The classifier token which is used when doing sequence classification (classification of the whole + sequence instead of per-token classification). It is the first token of the sequence when built with + special tokens. + mask_token (:obj:`string`, `optional`, defaults to "[MASK]"): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether to tokenize Chinese characters. + This should likely be deactivated for Japanese: + see: https://github.com/huggingface/transformers/issues/328 + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + **kwargs + ): + super().__init__( + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs, + ) + self.max_len_single_sentence = self.max_len - 2 # take into account special tokens + self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens + + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " + "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=tokenize_chinese_chars + ) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + + @property + def vocab_size(self): + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab, **self.added_tokens_encoder) + + def _tokenize(self, text): + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """ Converts a sequence of tokens (string) in a single string. 
""" + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks + by concatenating and adding special tokens. + A BERT sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + + Args: + token_ids_0 (:obj:`List[int]`): + List of ids. + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Set to True if the token list is already formatted with special tokens for the model + + Returns: + :obj:`List[int]`: A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Creates a mask from the two sequences passed to be used in a sequence-pair classification task. + A BERT sequence pair mask has the following format: + + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + if token_ids_1 is None, only returns the first portion of the mask (0's). + + Args: + token_ids_0 (:obj:`List[int]`): + List of ids. + token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given + sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def save_vocabulary(self, vocab_path): + """ + Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. 
+ + Args: + vocab_path (:obj:`str`): + The directory in which to save the vocabulary. + + Returns: + :obj:`Tuple(str)`: Paths to the files saved. + """ + index = 0 + if os.path.isdir(vocab_path): + vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"]) + else: + vocab_file = vocab_path + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + "Saving vocabulary to {}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!".format(vocab_file) + ) + index = token_index + writer.write(token + "\n") + index += 1 + return (vocab_file,) + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True): + """ Constructs a BasicTokenizer. + + Args: + **do_lower_case**: Whether to lower case the input. + **never_split**: (`optional`) list of str + Kept for backward compatibility purposes. + Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) + List of token not to split. + **tokenize_chinese_chars**: (`optional`) boolean (default True) + Whether to tokenize Chinese characters. + This should likely be deactivated for Japanese: + see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328 + """ + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = never_split + self.tokenize_chinese_chars = tokenize_chinese_chars + + def tokenize(self, text, never_split=None): + """ Basic Tokenization of a piece of text. + Split on "white spaces" only, for sub-word tokenization, see WordPieceTokenizer. + + Args: + **never_split**: (`optional`) list of str + Kept for backward compatibility purposes. + Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) + List of token not to split. + """ + never_split = self.never_split + (never_split if never_split is not None else []) + text = self._clean_text(text) + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
+ if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case and token not in never_split: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if never_split is not None and text in never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) # + or (cp >= 0x20000 and cp <= 0x2A6DF) # + or (cp >= 0x2A700 and cp <= 0x2B73F) # + or (cp >= 0x2B740 and cp <= 0x2B81F) # + or (cp >= 0x2B820 and cp <= 0x2CEAF) # + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) # + ): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer`. + + Returns: + A list of wordpiece tokens. 
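[Editor's note] To make the greedy longest-match-first behaviour concrete, a toy sketch (editorial, not part of the diff): the four-entry vocabulary is invented for illustration, whereas real BERT vocabularies such as the one added in this patch have ~30k entries.

    vocab = {"un": 0, "##aff": 1, "##able": 2, "[UNK]": 3}
    wp = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")
    wp.tokenize("unaffable")      # -> ['un', '##aff', '##able']
    wp.tokenize("unaffordable")   # -> ['[UNK]']  (no sequence of vocab pieces covers the whole word)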
+ """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/modelzoo/ELECTRA/utils.py b/modelzoo/ELECTRA/utils.py new file mode 100644 index 00000000..d3e3cc04 --- /dev/null +++ b/modelzoo/ELECTRA/utils.py @@ -0,0 +1,231 @@ +# Copyright (c) 2020 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import json, pickle, sys, unicodedata, six, time, os +import horovod.tensorflow as hvd +import tensorflow as tf +import dllogger + +def get_rank(): + try: + return hvd.rank() + except: + return 0 + + +def get_world_size(): + try: + return hvd.size() + except: + return 1 + + +def is_main_process(): + return get_rank() == 0 + + +def format_step(step): + if isinstance(step, str): + return step + s = "" + if len(step) == 1: + s += "Training Iteration: {} ".format(step[0]) + return s + if len(step) > 0: + s += "Training Epoch: {} ".format(step[0]) + if len(step) > 1: + s += "Training Iteration: {} ".format(step[1]) + return s + + +def load_json(path): + with tf.io.gfile.GFile(path, "r") as f: + return json.load(f) + + +def write_json(o, path): + if "/" in path: + tf.io.gfile.makedirs(path.rsplit("/", 1)[0]) + with tf.io.gfile.GFile(path, "w") as f: + json.dump(o, f) + + +def load_pickle(path): + with tf.io.gfile.GFile(path, "rb") as f: + return pickle.load(f) + + +def write_pickle(o, path): + if "/" in path: + tf.io.gfile.makedirs(path.rsplit("/", 1)[0]) + with tf.io.gfile.GFile(path, "wb") as f: + pickle.dump(o, f, -1) + + +def mkdir(path): + if not tf.io.gfile.exists(path): + tf.io.gfile.makedirs(path) + + +def rmrf(path): + if tf.io.gfile.exists(path): + tf.io.gfile.rmtree(path) + + +def rmkdir(path): + rmrf(path) + mkdir(path) + + +def log(*args, **kwargs): + all_rank = kwargs.pop("all_rank", False) + if not all_rank and not is_main_process(): + return + msg = " ".join(map(str, args)) + sys.stdout.write(msg + "\n") + sys.stdout.flush() + + +def log_config(config): + for key, value in sorted(config.__dict__.items()): + log(key, value) + log() + + +def heading(*args): + log(80 * "=") + log(*args) + log(80 * "=") + + +def nest_dict(d, prefixes, delim="_"): + """Go from {prefix_key: value} to {prefix: {key: value}}.""" + nested = {} + for k, v in d.items(): + for prefix in prefixes: + if k.startswith(prefix + delim): + if prefix not in nested: + nested[prefix] = {} + nested[prefix][k.split(delim, 1)[1]] = v + else: + nested[k] = v + return nested + + +def flatten_dict(d, delim="_"): + """Go from {prefix: {key: value}} to {prefix_key: value}.""" + flattened = {} + for k, v in d.items(): + if isinstance(v, dict): + for k2, v2 in v.items(): + flattened[k + delim + k2] = v2 + else: + flattened[k] = v + return flattened + + +def printable_text(text): + """Returns text encoded in a way suitable for print or `tf.logging`.""" + + # These functions want `str` for both Python2 and Python3, but in one case + # it's a Unicode string and in the other it's a byte string. 
+ if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text + elif isinstance(text, unicode): + return text.encode("utf-8") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def get_readable_time(elapsed): + d, h, m, s = [int(x) for x in time.strftime("%d:%H:%M:%S", time.gmtime(elapsed)).split(':')] + d -= 1 + return '{:2d}h{:2d}m{:2d}s'.format(24*d + h, m, s) + +def setup_logger(args): + os.makedirs(args.log_dir, exist_ok=True) + if not args.json_summary: + log_path = os.path.join(args.log_dir, 'dllogger_rank{}.log'.format(get_rank())) + else: + log_path = "{}_rank{}".format(args.json_summary, get_rank()) + + if is_main_process(): + dllogger.init(backends = [dllogger.JSONStreamBackend(verbosity=1, filename=log_path), + dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE, step_format=format_step)]) + else: + dllogger.init(backends = [dllogger.JSONStreamBackend(verbosity=1, filename=log_path)]) + + for k,v in vars(args).items(): + dllogger.log(step='PARAMETER', data={k:v}, verbosity=0) + + container_setup_info = { + 'NVIDIA_TENSORFLOW_VERSION': os.environ.get('NVIDIA_TENSORFLOW_VERSION'), + 'TENSORFLOW_VERSION': os.environ.get('TENSORFLOW_VERSION'), + 'CUBLAS_VERSION': os.environ.get('CUBLAS_VERSION'), + 'NCCL_VERSION': os.environ.get('NCCL_VERSION'), + 'CUDA_DRIVER_VERSION': os.environ.get('CUDA_DRIVER_VERSION'), + 'CUDNN_VERSION': os.environ.get('CUDNN_VERSION'), + 'CUDA_VERSION': os.environ.get('CUDA_VERSION'), + 'NVIDIA_PIPELINE_ID': os.environ.get('NVIDIA_PIPELINE_ID'), + 'NVIDIA_BUILD_ID': os.environ.get('NVIDIA_BUILD_ID'), + 'NVIDIA_TF32_OVERRIDE': os.environ.get('NVIDIA_TF32_OVERRIDE'), + } + dllogger.log(step='PARAMETER', data=container_setup_info, verbosity=0) + +def postprocess_dllog(args): + if not args.json_summary: + log_path = os.path.join(args.log_dir, 'dllogger_rank{}.log') + else: + log_path = str(args.json_summary) + "_rank{}" + logfiles = [open(log_path.format(i), 'r') for i in range(get_world_size())] + + if not args.json_summary: + log_path = os.path.join(args.log_dir, 'dllogger.log') + else: + log_path = str(args.json_summary) + + with open(log_path, 'w') as dest_file: + for lines in zip(*[f.readlines() for f in logfiles]): + json_lines = [json.loads(l[5:]) for l in lines] + + assert all(x['type'] == json_lines[0]['type'] for x in json_lines) + if json_lines[0]['type'] != 'LOG': + dest_file.write(lines[0]) + continue + + assert all(x['step'] == json_lines[0]['step'] for x in json_lines) + if json_lines[0]['step'] == 'PARAMETER': + dest_file.write(lines[0]) + else: + d = dict.fromkeys(json_lines[0]['data']) + for k in d.keys(): + vs = [line['data'][k] for line in json_lines] + d[k] = sum(vs)/len(vs) + json_lines[0]['data'] = d + dest_file.write('DLLL ') + dest_file.write(json.dumps(json_lines[0])) + dest_file.write('\n') + + for l in logfiles: + l.close() diff --git a/modelzoo/ELECTRA/vocab/vocab.txt b/modelzoo/ELECTRA/vocab/vocab.txt new file mode 100755 index 00000000..fb140275 --- /dev/null +++ b/modelzoo/ELECTRA/vocab/vocab.txt @@ -0,0 +1,30522 @@ +[PAD] +[unused0] +[unused1] +[unused2] +[unused3] +[unused4] +[unused5] +[unused6] +[unused7] +[unused8] +[unused9] +[unused10] +[unused11] +[unused12] +[unused13] +[unused14] +[unused15] +[unused16] +[unused17] +[unused18] 
+[unused19] +[unused20] +[unused21] +[unused22] +[unused23] +[unused24] +[unused25] +[unused26] +[unused27] +[unused28] +[unused29] +[unused30] +[unused31] +[unused32] +[unused33] +[unused34] +[unused35] +[unused36] +[unused37] +[unused38] +[unused39] +[unused40] +[unused41] +[unused42] +[unused43] +[unused44] +[unused45] +[unused46] +[unused47] +[unused48] +[unused49] +[unused50] +[unused51] +[unused52] +[unused53] +[unused54] +[unused55] +[unused56] +[unused57] +[unused58] +[unused59] +[unused60] +[unused61] +[unused62] +[unused63] +[unused64] +[unused65] +[unused66] +[unused67] +[unused68] +[unused69] +[unused70] +[unused71] +[unused72] +[unused73] +[unused74] +[unused75] +[unused76] +[unused77] +[unused78] +[unused79] +[unused80] +[unused81] +[unused82] +[unused83] +[unused84] +[unused85] +[unused86] +[unused87] +[unused88] +[unused89] +[unused90] +[unused91] +[unused92] +[unused93] +[unused94] +[unused95] +[unused96] +[unused97] +[unused98] +[UNK] +[CLS] +[SEP] +[MASK] +[unused99] +[unused100] +[unused101] +[unused102] +[unused103] +[unused104] +[unused105] +[unused106] +[unused107] +[unused108] +[unused109] +[unused110] +[unused111] +[unused112] +[unused113] +[unused114] +[unused115] +[unused116] +[unused117] +[unused118] +[unused119] +[unused120] +[unused121] +[unused122] +[unused123] +[unused124] +[unused125] +[unused126] +[unused127] +[unused128] +[unused129] +[unused130] +[unused131] +[unused132] +[unused133] +[unused134] +[unused135] +[unused136] +[unused137] +[unused138] +[unused139] +[unused140] +[unused141] +[unused142] +[unused143] +[unused144] +[unused145] +[unused146] +[unused147] +[unused148] +[unused149] +[unused150] +[unused151] +[unused152] +[unused153] +[unused154] +[unused155] +[unused156] +[unused157] +[unused158] +[unused159] +[unused160] +[unused161] +[unused162] +[unused163] +[unused164] +[unused165] +[unused166] +[unused167] +[unused168] +[unused169] +[unused170] +[unused171] +[unused172] +[unused173] +[unused174] +[unused175] +[unused176] +[unused177] +[unused178] +[unused179] +[unused180] +[unused181] +[unused182] +[unused183] +[unused184] +[unused185] +[unused186] +[unused187] +[unused188] +[unused189] +[unused190] +[unused191] +[unused192] +[unused193] +[unused194] +[unused195] +[unused196] +[unused197] +[unused198] +[unused199] +[unused200] +[unused201] +[unused202] +[unused203] +[unused204] +[unused205] +[unused206] +[unused207] +[unused208] +[unused209] +[unused210] +[unused211] +[unused212] +[unused213] +[unused214] +[unused215] +[unused216] +[unused217] +[unused218] +[unused219] +[unused220] +[unused221] +[unused222] +[unused223] +[unused224] +[unused225] +[unused226] +[unused227] +[unused228] +[unused229] +[unused230] +[unused231] +[unused232] +[unused233] +[unused234] +[unused235] +[unused236] +[unused237] +[unused238] +[unused239] +[unused240] +[unused241] +[unused242] +[unused243] +[unused244] +[unused245] +[unused246] +[unused247] +[unused248] +[unused249] +[unused250] +[unused251] +[unused252] +[unused253] +[unused254] +[unused255] +[unused256] +[unused257] +[unused258] +[unused259] +[unused260] +[unused261] +[unused262] +[unused263] +[unused264] +[unused265] +[unused266] +[unused267] +[unused268] +[unused269] +[unused270] +[unused271] +[unused272] +[unused273] +[unused274] +[unused275] +[unused276] +[unused277] +[unused278] +[unused279] +[unused280] +[unused281] +[unused282] +[unused283] +[unused284] +[unused285] +[unused286] +[unused287] +[unused288] +[unused289] +[unused290] +[unused291] +[unused292] +[unused293] +[unused294] +[unused295] 
+[unused296] +[unused297] +[unused298] +[unused299] +[unused300] +[unused301] +[unused302] +[unused303] +[unused304] +[unused305] +[unused306] +[unused307] +[unused308] +[unused309] +[unused310] +[unused311] +[unused312] +[unused313] +[unused314] +[unused315] +[unused316] +[unused317] +[unused318] +[unused319] +[unused320] +[unused321] +[unused322] +[unused323] +[unused324] +[unused325] +[unused326] +[unused327] +[unused328] +[unused329] +[unused330] +[unused331] +[unused332] +[unused333] +[unused334] +[unused335] +[unused336] +[unused337] +[unused338] +[unused339] +[unused340] +[unused341] +[unused342] +[unused343] +[unused344] +[unused345] +[unused346] +[unused347] +[unused348] +[unused349] +[unused350] +[unused351] +[unused352] +[unused353] +[unused354] +[unused355] +[unused356] +[unused357] +[unused358] +[unused359] +[unused360] +[unused361] +[unused362] +[unused363] +[unused364] +[unused365] +[unused366] +[unused367] +[unused368] +[unused369] +[unused370] +[unused371] +[unused372] +[unused373] +[unused374] +[unused375] +[unused376] +[unused377] +[unused378] +[unused379] +[unused380] +[unused381] +[unused382] +[unused383] +[unused384] +[unused385] +[unused386] +[unused387] +[unused388] +[unused389] +[unused390] +[unused391] +[unused392] +[unused393] +[unused394] +[unused395] +[unused396] +[unused397] +[unused398] +[unused399] +[unused400] +[unused401] +[unused402] +[unused403] +[unused404] +[unused405] +[unused406] +[unused407] +[unused408] +[unused409] +[unused410] +[unused411] +[unused412] +[unused413] +[unused414] +[unused415] +[unused416] +[unused417] +[unused418] +[unused419] +[unused420] +[unused421] +[unused422] +[unused423] +[unused424] +[unused425] +[unused426] +[unused427] +[unused428] +[unused429] +[unused430] +[unused431] +[unused432] +[unused433] +[unused434] +[unused435] +[unused436] +[unused437] +[unused438] +[unused439] +[unused440] +[unused441] +[unused442] +[unused443] +[unused444] +[unused445] +[unused446] +[unused447] +[unused448] +[unused449] +[unused450] +[unused451] +[unused452] +[unused453] +[unused454] +[unused455] +[unused456] +[unused457] +[unused458] +[unused459] +[unused460] +[unused461] +[unused462] +[unused463] +[unused464] +[unused465] +[unused466] +[unused467] +[unused468] +[unused469] +[unused470] +[unused471] +[unused472] +[unused473] +[unused474] +[unused475] +[unused476] +[unused477] +[unused478] +[unused479] +[unused480] +[unused481] +[unused482] +[unused483] +[unused484] +[unused485] +[unused486] +[unused487] +[unused488] +[unused489] +[unused490] +[unused491] +[unused492] +[unused493] +[unused494] +[unused495] +[unused496] +[unused497] +[unused498] +[unused499] +[unused500] +[unused501] +[unused502] +[unused503] +[unused504] +[unused505] +[unused506] +[unused507] +[unused508] +[unused509] +[unused510] +[unused511] +[unused512] +[unused513] +[unused514] +[unused515] +[unused516] +[unused517] +[unused518] +[unused519] +[unused520] +[unused521] +[unused522] +[unused523] +[unused524] +[unused525] +[unused526] +[unused527] +[unused528] +[unused529] +[unused530] +[unused531] +[unused532] +[unused533] +[unused534] +[unused535] +[unused536] +[unused537] +[unused538] +[unused539] +[unused540] +[unused541] +[unused542] +[unused543] +[unused544] +[unused545] +[unused546] +[unused547] +[unused548] +[unused549] +[unused550] +[unused551] +[unused552] +[unused553] +[unused554] +[unused555] +[unused556] +[unused557] +[unused558] +[unused559] +[unused560] +[unused561] +[unused562] +[unused563] +[unused564] +[unused565] +[unused566] +[unused567] +[unused568] 
+[unused569] +[unused570] +[unused571] +[unused572] +[unused573] +[unused574] +[unused575] +[unused576] +[unused577] +[unused578] +[unused579] +[unused580] +[unused581] +[unused582] +[unused583] +[unused584] +[unused585] +[unused586] +[unused587] +[unused588] +[unused589] +[unused590] +[unused591] +[unused592] +[unused593] +[unused594] +[unused595] +[unused596] +[unused597] +[unused598] +[unused599] +[unused600] +[unused601] +[unused602] +[unused603] +[unused604] +[unused605] +[unused606] +[unused607] +[unused608] +[unused609] +[unused610] +[unused611] +[unused612] +[unused613] +[unused614] +[unused615] +[unused616] +[unused617] +[unused618] +[unused619] +[unused620] +[unused621] +[unused622] +[unused623] +[unused624] +[unused625] +[unused626] +[unused627] +[unused628] +[unused629] +[unused630] +[unused631] +[unused632] +[unused633] +[unused634] +[unused635] +[unused636] +[unused637] +[unused638] +[unused639] +[unused640] +[unused641] +[unused642] +[unused643] +[unused644] +[unused645] +[unused646] +[unused647] +[unused648] +[unused649] +[unused650] +[unused651] +[unused652] +[unused653] +[unused654] +[unused655] +[unused656] +[unused657] +[unused658] +[unused659] +[unused660] +[unused661] +[unused662] +[unused663] +[unused664] +[unused665] +[unused666] +[unused667] +[unused668] +[unused669] +[unused670] +[unused671] +[unused672] +[unused673] +[unused674] +[unused675] +[unused676] +[unused677] +[unused678] +[unused679] +[unused680] +[unused681] +[unused682] +[unused683] +[unused684] +[unused685] +[unused686] +[unused687] +[unused688] +[unused689] +[unused690] +[unused691] +[unused692] +[unused693] +[unused694] +[unused695] +[unused696] +[unused697] +[unused698] +[unused699] +[unused700] +[unused701] +[unused702] +[unused703] +[unused704] +[unused705] +[unused706] +[unused707] +[unused708] +[unused709] +[unused710] +[unused711] +[unused712] +[unused713] +[unused714] +[unused715] +[unused716] +[unused717] +[unused718] +[unused719] +[unused720] +[unused721] +[unused722] +[unused723] +[unused724] +[unused725] +[unused726] +[unused727] +[unused728] +[unused729] +[unused730] +[unused731] +[unused732] +[unused733] +[unused734] +[unused735] +[unused736] +[unused737] +[unused738] +[unused739] +[unused740] +[unused741] +[unused742] +[unused743] +[unused744] +[unused745] +[unused746] +[unused747] +[unused748] +[unused749] +[unused750] +[unused751] +[unused752] +[unused753] +[unused754] +[unused755] +[unused756] +[unused757] +[unused758] +[unused759] +[unused760] +[unused761] +[unused762] +[unused763] +[unused764] +[unused765] +[unused766] +[unused767] +[unused768] +[unused769] +[unused770] +[unused771] +[unused772] +[unused773] +[unused774] +[unused775] +[unused776] +[unused777] +[unused778] +[unused779] +[unused780] +[unused781] +[unused782] +[unused783] +[unused784] +[unused785] +[unused786] +[unused787] +[unused788] +[unused789] +[unused790] +[unused791] +[unused792] +[unused793] +[unused794] +[unused795] +[unused796] +[unused797] +[unused798] +[unused799] +[unused800] +[unused801] +[unused802] +[unused803] +[unused804] +[unused805] +[unused806] +[unused807] +[unused808] +[unused809] +[unused810] +[unused811] +[unused812] +[unused813] +[unused814] +[unused815] +[unused816] +[unused817] +[unused818] +[unused819] +[unused820] +[unused821] +[unused822] +[unused823] +[unused824] +[unused825] +[unused826] +[unused827] +[unused828] +[unused829] +[unused830] +[unused831] +[unused832] +[unused833] +[unused834] +[unused835] +[unused836] +[unused837] +[unused838] +[unused839] +[unused840] +[unused841] 
+[unused842] +[unused843] +[unused844] +[unused845] +[unused846] +[unused847] +[unused848] +[unused849] +[unused850] +[unused851] +[unused852] +[unused853] +[unused854] +[unused855] +[unused856] +[unused857] +[unused858] +[unused859] +[unused860] +[unused861] +[unused862] +[unused863] +[unused864] +[unused865] +[unused866] +[unused867] +[unused868] +[unused869] +[unused870] +[unused871] +[unused872] +[unused873] +[unused874] +[unused875] +[unused876] +[unused877] +[unused878] +[unused879] +[unused880] +[unused881] +[unused882] +[unused883] +[unused884] +[unused885] +[unused886] +[unused887] +[unused888] +[unused889] +[unused890] +[unused891] +[unused892] +[unused893] +[unused894] +[unused895] +[unused896] +[unused897] +[unused898] +[unused899] +[unused900] +[unused901] +[unused902] +[unused903] +[unused904] +[unused905] +[unused906] +[unused907] +[unused908] +[unused909] +[unused910] +[unused911] +[unused912] +[unused913] +[unused914] +[unused915] +[unused916] +[unused917] +[unused918] +[unused919] +[unused920] +[unused921] +[unused922] +[unused923] +[unused924] +[unused925] +[unused926] +[unused927] +[unused928] +[unused929] +[unused930] +[unused931] +[unused932] +[unused933] +[unused934] +[unused935] +[unused936] +[unused937] +[unused938] +[unused939] +[unused940] +[unused941] +[unused942] +[unused943] +[unused944] +[unused945] +[unused946] +[unused947] +[unused948] +[unused949] +[unused950] +[unused951] +[unused952] +[unused953] +[unused954] +[unused955] +[unused956] +[unused957] +[unused958] +[unused959] +[unused960] +[unused961] +[unused962] +[unused963] +[unused964] +[unused965] +[unused966] +[unused967] +[unused968] +[unused969] +[unused970] +[unused971] +[unused972] +[unused973] +[unused974] +[unused975] +[unused976] +[unused977] +[unused978] +[unused979] +[unused980] +[unused981] +[unused982] +[unused983] +[unused984] +[unused985] +[unused986] +[unused987] +[unused988] +[unused989] +[unused990] +[unused991] +[unused992] +[unused993] +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? 
+@ +[ +\ +] +^ +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +{ +| +} +~ +¡ +¢ +£ +¤ +¥ +¦ +§ +¨ +© +ª +« +¬ +® +° +± +² +³ +´ +µ +¶ +· +¹ +º +» +¼ +½ +¾ +¿ +× +ß +æ +ð +÷ +ø +þ +đ +ħ +ı +ł +ŋ +œ +ƒ +ɐ +ɑ +ɒ +ɔ +ɕ +ə +ɛ +ɡ +ɣ +ɨ +ɪ +ɫ +ɬ +ɯ +ɲ +ɴ +ɹ +ɾ +ʀ +ʁ +ʂ +ʃ +ʉ +ʊ +ʋ +ʌ +ʎ +ʐ +ʑ +ʒ +ʔ +ʰ +ʲ +ʳ +ʷ +ʸ +ʻ +ʼ +ʾ +ʿ +ˈ +ː +ˡ +ˢ +ˣ +ˤ +α +β +γ +δ +ε +ζ +η +θ +ι +κ +λ +μ +ν +ξ +ο +π +ρ +ς +σ +τ +υ +φ +χ +ψ +ω +а +б +в +г +д +е +ж +з +и +к +л +м +н +о +п +р +с +т +у +ф +х +ц +ч +ш +щ +ъ +ы +ь +э +ю +я +ђ +є +і +ј +љ +њ +ћ +ӏ +ա +բ +գ +դ +ե +թ +ի +լ +կ +հ +մ +յ +ն +ո +պ +ս +վ +տ +ր +ւ +ք +־ +א +ב +ג +ד +ה +ו +ז +ח +ט +י +ך +כ +ל +ם +מ +ן +נ +ס +ע +ף +פ +ץ +צ +ק +ר +ש +ת +، +ء +ا +ب +ة +ت +ث +ج +ح +خ +د +ذ +ر +ز +س +ش +ص +ض +ط +ظ +ع +غ +ـ +ف +ق +ك +ل +م +ن +ه +و +ى +ي +ٹ +پ +چ +ک +گ +ں +ھ +ہ +ی +ے +अ +आ +उ +ए +क +ख +ग +च +ज +ट +ड +ण +त +थ +द +ध +न +प +ब +भ +म +य +र +ल +व +श +ष +स +ह +ा +ि +ी +ो +। +॥ +ং +অ +আ +ই +উ +এ +ও +ক +খ +গ +চ +ছ +জ +ট +ড +ণ +ত +থ +দ +ধ +ন +প +ব +ভ +ম +য +র +ল +শ +ষ +স +হ +া +ি +ী +ে +க +ச +ட +த +ந +ன +ப +ம +ய +ர +ல +ள +வ +ா +ி +ு +ே +ை +ನ +ರ +ಾ +ක +ය +ර +ල +ව +ා +ก +ง +ต +ท +น +พ +ม +ย +ร +ล +ว +ส +อ +า +เ +་ +། +ག +ང +ད +ན +པ +བ +མ +འ +ར +ལ +ས +မ +ა +ბ +გ +დ +ე +ვ +თ +ი +კ +ლ +მ +ნ +ო +რ +ს +ტ +უ +ᄀ +ᄂ +ᄃ +ᄅ +ᄆ +ᄇ +ᄉ +ᄊ +ᄋ +ᄌ +ᄎ +ᄏ +ᄐ +ᄑ +ᄒ +ᅡ +ᅢ +ᅥ +ᅦ +ᅧ +ᅩ +ᅪ +ᅭ +ᅮ +ᅯ +ᅲ +ᅳ +ᅴ +ᅵ +ᆨ +ᆫ +ᆯ +ᆷ +ᆸ +ᆼ +ᴬ +ᴮ +ᴰ +ᴵ +ᴺ +ᵀ +ᵃ +ᵇ +ᵈ +ᵉ +ᵍ +ᵏ +ᵐ +ᵒ +ᵖ +ᵗ +ᵘ +ᵢ +ᵣ +ᵤ +ᵥ +ᶜ +ᶠ +‐ +‑ +‒ +– +— +― +‖ +‘ +’ +‚ +“ +” +„ +† +‡ +• +… +‰ +′ +″ +› +‿ +⁄ +⁰ +ⁱ +⁴ +⁵ +⁶ +⁷ +⁸ +⁹ +⁺ +⁻ +ⁿ +₀ +₁ +₂ +₃ +₄ +₅ +₆ +₇ +₈ +₉ +₊ +₍ +₎ +ₐ +ₑ +ₒ +ₓ +ₕ +ₖ +ₗ +ₘ +ₙ +ₚ +ₛ +ₜ +₤ +₩ +€ +₱ +₹ +ℓ +№ +ℝ +™ +⅓ +⅔ +← +↑ +→ +↓ +↔ +↦ +⇄ +⇌ +⇒ +∂ +∅ +∆ +∇ +∈ +− +∗ +∘ +√ +∞ +∧ +∨ +∩ +∪ +≈ +≡ +≤ +≥ +⊂ +⊆ +⊕ +⊗ +⋅ +─ +│ +■ +▪ +● +★ +☆ +☉ +♠ +♣ +♥ +♦ +♭ +♯ +⟨ +⟩ +ⱼ +⺩ +⺼ +⽥ +、 +。 +〈 +〉 +《 +》 +「 +」 +『 +』 +〜 +あ +い +う +え +お +か +き +く +け +こ +さ +し +す +せ +そ +た +ち +っ +つ +て +と +な +に +ぬ +ね +の +は +ひ +ふ +へ +ほ +ま +み +む +め +も +や +ゆ +よ +ら +り +る +れ +ろ +を +ん +ァ +ア +ィ +イ +ウ +ェ +エ +オ +カ +キ +ク +ケ +コ +サ +シ +ス +セ +タ +チ +ッ +ツ +テ +ト +ナ +ニ +ノ +ハ +ヒ +フ +ヘ +ホ +マ +ミ +ム +メ +モ +ャ +ュ +ョ +ラ +リ +ル +レ +ロ +ワ +ン +・ +ー +一 +三 +上 +下 +不 +世 +中 +主 +久 +之 +也 +事 +二 +五 +井 +京 +人 +亻 +仁 +介 +代 +仮 +伊 +会 +佐 +侍 +保 +信 +健 +元 +光 +八 +公 +内 +出 +分 +前 +劉 +力 +加 +勝 +北 +区 +十 +千 +南 +博 +原 +口 +古 +史 +司 +合 +吉 +同 +名 +和 +囗 +四 +国 +國 +土 +地 +坂 +城 +堂 +場 +士 +夏 +外 +大 +天 +太 +夫 +奈 +女 +子 +学 +宀 +宇 +安 +宗 +定 +宣 +宮 +家 +宿 +寺 +將 +小 +尚 +山 +岡 +島 +崎 +川 +州 +巿 +帝 +平 +年 +幸 +广 +弘 +張 +彳 +後 +御 +德 +心 +忄 +志 +忠 +愛 +成 +我 +戦 +戸 +手 +扌 +政 +文 +新 +方 +日 +明 +星 +春 +昭 +智 +曲 +書 +月 +有 +朝 +木 +本 +李 +村 +東 +松 +林 +森 +楊 +樹 +橋 +歌 +止 +正 +武 +比 +氏 +民 +水 +氵 +氷 +永 +江 +沢 +河 +治 +法 +海 +清 +漢 +瀬 +火 +版 +犬 +王 +生 +田 +男 +疒 +発 +白 +的 +皇 +目 +相 +省 +真 +石 +示 +社 +神 +福 +禾 +秀 +秋 +空 +立 +章 +竹 +糹 +美 +義 +耳 +良 +艹 +花 +英 +華 +葉 +藤 +行 +街 +西 +見 +訁 +語 +谷 +貝 +貴 +車 +軍 +辶 +道 +郎 +郡 +部 +都 +里 +野 +金 +鈴 +镇 +長 +門 +間 +阝 +阿 +陳 +陽 +雄 +青 +面 +風 +食 +香 +馬 +高 +龍 +龸 +fi +fl +! +( +) +, +- +. +/ +: +? 
+~ +the +of +and +in +to +was +he +is +as +for +on +with +that +it +his +by +at +from +her +##s +she +you +had +an +were +but +be +this +are +not +my +they +one +which +or +have +him +me +first +all +also +their +has +up +who +out +been +when +after +there +into +new +two +its +##a +time +would +no +what +about +said +we +over +then +other +so +more +##e +can +if +like +back +them +only +some +could +##i +where +just +##ing +during +before +##n +do +##o +made +school +through +than +now +years +most +world +may +between +down +well +three +##d +year +while +will +##ed +##r +##y +later +##t +city +under +around +did +such +being +used +state +people +part +know +against +your +many +second +university +both +national +##er +these +don +known +off +way +until +re +how +even +get +head +... +didn +##ly +team +american +because +de +##l +born +united +film +since +still +long +work +south +us +became +any +high +again +day +family +see +right +man +eyes +house +season +war +states +including +took +life +north +same +each +called +name +much +place +however +go +four +group +another +found +won +area +here +going +10 +away +series +left +home +music +best +make +hand +number +company +several +never +last +john +000 +very +album +take +end +good +too +following +released +game +played +little +began +district +##m +old +want +those +side +held +own +early +county +ll +league +use +west +##u +face +think +##es +2010 +government +##h +march +came +small +general +town +june +##on +line +based +something +##k +september +thought +looked +along +international +2011 +air +july +club +went +january +october +our +august +april +york +12 +few +2012 +2008 +east +show +member +college +2009 +father +public +##us +come +men +five +set +station +church +##c +next +former +november +room +party +located +december +2013 +age +got +2007 +##g +system +let +love +2006 +though +every +2014 +look +song +water +century +without +body +black +night +within +great +women +single +ve +building +large +population +river +named +band +white +started +##an +once +15 +20 +should +18 +2015 +service +top +built +british +open +death +king +moved +local +times +children +february +book +why +11 +door +need +president +order +final +road +wasn +although +due +major +died +village +third +knew +2016 +asked +turned +st +wanted +say +##p +together +received +main +son +served +different +##en +behind +himself +felt +members +power +football +law +voice +play +##in +near +park +history +30 +having +2005 +16 +##man +saw +mother +##al +army +point +front +help +english +street +art +late +hands +games +award +##ia +young +14 +put +published +country +division +across +told +13 +often +ever +french +london +center +six +red +2017 +led +days +include +light +25 +find +tell +among +species +really +according +central +half +2004 +form +original +gave +office +making +enough +lost +full +opened +must +included +live +given +german +player +run +business +woman +community +cup +might +million +land +2000 +court +development +17 +short +round +ii +km +seen +class +story +always +become +sure +research +almost +director +council +la +##2 +career +things +using +island +##z +couldn +car +##is +24 +close +force +##1 +better +free +support +control +field +students +2003 +education +married +##b +nothing +worked +others +record +big +inside +level +anything +continued +give +james +##3 +military +established +non +returned +feel +does +title +written +thing +feet +william +far +co +association +hard +already +2002 +##ra +championship 
+human +western +100 +##na +department +hall +role +various +production +21 +19 +heart +2001 +living +fire +version +##ers +##f +television +royal +##4 +produced +working +act +case +society +region +present +radio +period +looking +least +total +keep +england +wife +program +per +brother +mind +special +22 +##le +am +works +soon +##6 +political +george +services +taken +created +##7 +further +able +reached +david +union +joined +upon +done +important +social +information +either +##ic +##x +appeared +position +ground +lead +rock +dark +election +23 +board +france +hair +course +arms +site +police +girl +instead +real +sound +##v +words +moment +##te +someone +##8 +summer +project +announced +san +less +wrote +past +followed +##5 +blue +founded +al +finally +india +taking +records +america +##ne +1999 +design +considered +northern +god +stop +battle +toward +european +outside +described +track +today +playing +language +28 +call +26 +heard +professional +low +australia +miles +california +win +yet +green +##ie +trying +blood +##ton +southern +science +maybe +everything +match +square +27 +mouth +video +race +recorded +leave +above +##9 +daughter +points +space +1998 +museum +change +middle +common +##0 +move +tv +post +##ta +lake +seven +tried +elected +closed +ten +paul +minister +##th +months +start +chief +return +canada +person +sea +release +similar +modern +brought +rest +hit +formed +mr +##la +1997 +floor +event +doing +thomas +1996 +robert +care +killed +training +star +week +needed +turn +finished +railway +rather +news +health +sent +example +ran +term +michael +coming +currently +yes +forces +despite +gold +areas +50 +stage +fact +29 +dead +says +popular +2018 +originally +germany +probably +developed +result +pulled +friend +stood +money +running +mi +signed +word +songs +child +eventually +met +tour +average +teams +minutes +festival +current +deep +kind +1995 +decided +usually +eastern +seemed +##ness +episode +bed +added +table +indian +private +charles +route +available +idea +throughout +centre +addition +appointed +style +1994 +books +eight +construction +press +mean +wall +friends +remained +schools +study +##ch +##um +institute +oh +chinese +sometimes +events +possible +1992 +australian +type +brown +forward +talk +process +food +debut +seat +performance +committee +features +character +arts +herself +else +lot +strong +russian +range +hours +peter +arm +##da +morning +dr +sold +##ry +quickly +directed +1993 +guitar +china +##w +31 +list +##ma +performed +media +uk +players +smile +##rs +myself +40 +placed +coach +province +towards +wouldn +leading +whole +boy +official +designed +grand +census +##el +europe +attack +japanese +henry +1991 +##re +##os +cross +getting +alone +action +lower +network +wide +washington +japan +1990 +hospital +believe +changed +sister +##ar +hold +gone +sir +hadn +ship +##ka +studies +academy +shot +rights +below +base +bad +involved +kept +largest +##ist +bank +future +especially +beginning +mark +movement +section +female +magazine +plan +professor +lord +longer +##ian +sat +walked +hill +actually +civil +energy +model +families +size +thus +aircraft +completed +includes +data +captain +##or +fight +vocals +featured +richard +bridge +fourth +1989 +officer +stone +hear +##ism +means +medical +groups +management +self +lips +competition +entire +lived +technology +leaving +federal +tournament +bit +passed +hot +independent +awards +kingdom +mary +spent +fine +doesn +reported +##ling +jack +fall +raised +itself +stay +true +studio +1988 
+sports +replaced +paris +systems +saint +leader +theatre +whose +market +capital +parents +spanish +canadian +earth +##ity +cut +degree +writing +bay +christian +awarded +natural +higher +bill +##as +coast +provided +previous +senior +ft +valley +organization +stopped +onto +countries +parts +conference +queen +security +interest +saying +allowed +master +earlier +phone +matter +smith +winning +try +happened +moving +campaign +los +##ley +breath +nearly +mid +1987 +certain +girls +date +italian +african +standing +fell +artist +##ted +shows +deal +mine +industry +1986 +##ng +everyone +republic +provide +collection +library +student +##ville +primary +owned +older +via +heavy +1st +makes +##able +attention +anyone +africa +##ri +stated +length +ended +fingers +command +staff +skin +foreign +opening +governor +okay +medal +kill +sun +cover +job +1985 +introduced +chest +hell +feeling +##ies +success +meet +reason +standard +meeting +novel +1984 +trade +source +buildings +##land +rose +guy +goal +##ur +chapter +native +husband +previously +unit +limited +entered +weeks +producer +operations +mountain +takes +covered +forced +related +roman +complete +successful +key +texas +cold +##ya +channel +1980 +traditional +films +dance +clear +approximately +500 +nine +van +prince +question +active +tracks +ireland +regional +silver +author +personal +sense +operation +##ine +economic +1983 +holding +twenty +isbn +additional +speed +hour +edition +regular +historic +places +whom +shook +movie +km² +secretary +prior +report +chicago +read +foundation +view +engine +scored +1982 +units +ask +airport +property +ready +immediately +lady +month +listed +contract +##de +manager +themselves +lines +##ki +navy +writer +meant +##ts +runs +##ro +practice +championships +singer +glass +commission +required +forest +starting +culture +generally +giving +access +attended +test +couple +stand +catholic +martin +caught +executive +##less +eye +##ey +thinking +chair +quite +shoulder +1979 +hope +decision +plays +defeated +municipality +whether +structure +offered +slowly +pain +ice +direction +##ion +paper +mission +1981 +mostly +200 +noted +individual +managed +nature +lives +plant +##ha +helped +except +studied +computer +figure +relationship +issue +significant +loss +die +smiled +gun +ago +highest +1972 +##am +male +bring +goals +mexico +problem +distance +commercial +completely +location +annual +famous +drive +1976 +neck +1978 +surface +caused +italy +understand +greek +highway +wrong +hotel +comes +appearance +joseph +double +issues +musical +companies +castle +income +review +assembly +bass +initially +parliament +artists +experience +1974 +particular +walk +foot +engineering +talking +window +dropped +##ter +miss +baby +boys +break +1975 +stars +edge +remember +policy +carried +train +stadium +bar +sex +angeles +evidence +##ge +becoming +assistant +soviet +1977 +upper +step +wing +1970 +youth +financial +reach +##ll +actor +numerous +##se +##st +nodded +arrived +##ation +minute +##nt +believed +sorry +complex +beautiful +victory +associated +temple +1968 +1973 +chance +perhaps +metal +##son +1945 +bishop +##et +lee +launched +particularly +tree +le +retired +subject +prize +contains +yeah +theory +empire +##ce +suddenly +waiting +trust +recording +##to +happy +terms +camp +champion +1971 +religious +pass +zealand +names +2nd +port +ancient +tom +corner +represented +watch +legal +anti +justice +cause +watched +brothers +45 +material +changes +simply +response +louis +fast +##ting +answer +60 +historical 
+1969 +stories +straight +create +feature +increased +rate +administration +virginia +el +activities +cultural +overall +winner +programs +basketball +legs +guard +beyond +cast +doctor +mm +flight +results +remains +cost +effect +winter +##ble +larger +islands +problems +chairman +grew +commander +isn +1967 +pay +failed +selected +hurt +fort +box +regiment +majority +journal +35 +edward +plans +##ke +##ni +shown +pretty +irish +characters +directly +scene +likely +operated +allow +spring +##j +junior +matches +looks +mike +houses +fellow +##tion +beach +marriage +##ham +##ive +rules +oil +65 +florida +expected +nearby +congress +sam +peace +recent +iii +wait +subsequently +cell +##do +variety +serving +agreed +please +poor +joe +pacific +attempt +wood +democratic +piece +prime +##ca +rural +mile +touch +appears +township +1964 +1966 +soldiers +##men +##ized +1965 +pennsylvania +closer +fighting +claimed +score +jones +physical +editor +##ous +filled +genus +specific +sitting +super +mom +##va +therefore +supported +status +fear +cases +store +meaning +wales +minor +spain +tower +focus +vice +frank +follow +parish +separate +golden +horse +fifth +remaining +branch +32 +presented +stared +##id +uses +secret +forms +##co +baseball +exactly +##ck +choice +note +discovered +travel +composed +truth +russia +ball +color +kiss +dad +wind +continue +ring +referred +numbers +digital +greater +##ns +metres +slightly +direct +increase +1960 +responsible +crew +rule +trees +troops +##no +broke +goes +individuals +hundred +weight +creek +sleep +memory +defense +provides +ordered +code +value +jewish +windows +1944 +safe +judge +whatever +corps +realized +growing +pre +##ga +cities +alexander +gaze +lies +spread +scott +letter +showed +situation +mayor +transport +watching +workers +extended +##li +expression +normal +##ment +chart +multiple +border +##ba +host +##ner +daily +mrs +walls +piano +##ko +heat +cannot +##ate +earned +products +drama +era +authority +seasons +join +grade +##io +sign +difficult +machine +1963 +territory +mainly +##wood +stations +squadron +1962 +stepped +iron +19th +##led +serve +appear +sky +speak +broken +charge +knowledge +kilometres +removed +ships +article +campus +simple +##ty +pushed +britain +##ve +leaves +recently +cd +soft +boston +latter +easy +acquired +poland +##sa +quality +officers +presence +planned +nations +mass +broadcast +jean +share +image +influence +wild +offer +emperor +electric +reading +headed +ability +promoted +yellow +ministry +1942 +throat +smaller +politician +##by +latin +spoke +cars +williams +males +lack +pop +80 +##ier +acting +seeing +consists +##ti +estate +1961 +pressure +johnson +newspaper +jr +chris +olympics +online +conditions +beat +elements +walking +vote +##field +needs +carolina +text +featuring +global +block +shirt +levels +francisco +purpose +females +et +dutch +duke +ahead +gas +twice +safety +serious +turning +highly +lieutenant +firm +maria +amount +mixed +daniel +proposed +perfect +agreement +affairs +3rd +seconds +contemporary +paid +1943 +prison +save +kitchen +label +administrative +intended +constructed +academic +nice +teacher +races +1956 +formerly +corporation +ben +nation +issued +shut +1958 +drums +housing +victoria +seems +opera +1959 +graduated +function +von +mentioned +picked +build +recognized +shortly +protection +picture +notable +exchange +elections +1980s +loved +percent +racing +fish +elizabeth +garden +volume +hockey +1941 +beside +settled +##ford +1940 +competed +replied +drew +1948 +actress +marine 
+scotland +steel +glanced +farm +steve +1957 +risk +tonight +positive +magic +singles +effects +gray +screen +dog +##ja +residents +bus +sides +none +secondary +literature +polish +destroyed +flying +founder +households +1939 +lay +reserve +usa +gallery +##ler +1946 +industrial +younger +approach +appearances +urban +ones +1950 +finish +avenue +powerful +fully +growth +page +honor +jersey +projects +advanced +revealed +basic +90 +infantry +pair +equipment +visit +33 +evening +search +grant +effort +solo +treatment +buried +republican +primarily +bottom +owner +1970s +israel +gives +jim +dream +bob +remain +spot +70 +notes +produce +champions +contact +ed +soul +accepted +ways +del +##ally +losing +split +price +capacity +basis +trial +questions +##ina +1955 +20th +guess +officially +memorial +naval +initial +##ization +whispered +median +engineer +##ful +sydney +##go +columbia +strength +300 +1952 +tears +senate +00 +card +asian +agent +1947 +software +44 +draw +warm +supposed +com +pro +##il +transferred +leaned +##at +candidate +escape +mountains +asia +potential +activity +entertainment +seem +traffic +jackson +murder +36 +slow +product +orchestra +haven +agency +bbc +taught +website +comedy +unable +storm +planning +albums +rugby +environment +scientific +grabbed +protect +##hi +boat +typically +1954 +1953 +damage +principal +divided +dedicated +mount +ohio +##berg +pick +fought +driver +##der +empty +shoulders +sort +thank +berlin +prominent +account +freedom +necessary +efforts +alex +headquarters +follows +alongside +des +simon +andrew +suggested +operating +learning +steps +1949 +sweet +technical +begin +easily +34 +teeth +speaking +settlement +scale +##sh +renamed +ray +max +enemy +semi +joint +compared +##rd +scottish +leadership +analysis +offers +georgia +pieces +captured +animal +deputy +guest +organized +##lin +tony +combined +method +challenge +1960s +huge +wants +battalion +sons +rise +crime +types +facilities +telling +path +1951 +platform +sit +1990s +##lo +tells +assigned +rich +pull +##ot +commonly +alive +##za +letters +concept +conducted +wearing +happen +bought +becomes +holy +gets +ocean +defeat +languages +purchased +coffee +occurred +titled +##q +declared +applied +sciences +concert +sounds +jazz +brain +##me +painting +fleet +tax +nick +##ius +michigan +count +animals +leaders +episodes +##line +content +##den +birth +##it +clubs +64 +palace +critical +refused +fair +leg +laughed +returning +surrounding +participated +formation +lifted +pointed +connected +rome +medicine +laid +taylor +santa +powers +adam +tall +shared +focused +knowing +yards +entrance +falls +##wa +calling +##ad +sources +chosen +beneath +resources +yard +##ite +nominated +silence +zone +defined +##que +gained +thirty +38 +bodies +moon +##ard +adopted +christmas +widely +register +apart +iran +premier +serves +du +unknown +parties +##les +generation +##ff +continues +quick +fields +brigade +quiet +teaching +clothes +impact +weapons +partner +flat +theater +supreme +1938 +37 +relations +##tor +plants +suffered +1936 +wilson +kids +begins +##age +1918 +seats +armed +internet +models +worth +laws +400 +communities +classes +background +knows +thanks +quarter +reaching +humans +carry +killing +format +kong +hong +setting +75 +architecture +disease +railroad +inc +possibly +wish +arthur +thoughts +harry +doors +density +##di +crowd +illinois +stomach +tone +unique +reports +anyway +##ir +liberal +der +vehicle +thick +dry +drug +faced +largely +facility +theme +holds +creation +strange +colonel +##mi 
+revolution +bell +politics +turns +silent +rail +relief +independence +combat +shape +write +determined +sales +learned +4th +finger +oxford +providing +1937 +heritage +fiction +situated +designated +allowing +distribution +hosted +##est +sight +interview +estimated +reduced +##ria +toronto +footballer +keeping +guys +damn +claim +motion +sport +sixth +stayed +##ze +en +rear +receive +handed +twelve +dress +audience +granted +brazil +##well +spirit +##ated +noticed +etc +olympic +representative +eric +tight +trouble +reviews +drink +vampire +missing +roles +ranked +newly +household +finals +wave +critics +##ee +phase +massachusetts +pilot +unlike +philadelphia +bright +guns +crown +organizations +roof +42 +respectively +clearly +tongue +marked +circle +fox +korea +bronze +brian +expanded +sexual +supply +yourself +inspired +labour +fc +##ah +reference +vision +draft +connection +brand +reasons +1935 +classic +driving +trip +jesus +cells +entry +1920 +neither +trail +claims +atlantic +orders +labor +nose +afraid +identified +intelligence +calls +cancer +attacked +passing +stephen +positions +imperial +grey +jason +39 +sunday +48 +swedish +avoid +extra +uncle +message +covers +allows +surprise +materials +fame +hunter +##ji +1930 +citizens +figures +davis +environmental +confirmed +shit +titles +di +performing +difference +acts +attacks +##ov +existing +votes +opportunity +nor +shop +entirely +trains +opposite +pakistan +##pa +develop +resulted +representatives +actions +reality +pressed +##ish +barely +wine +conversation +faculty +northwest +ends +documentary +nuclear +stock +grace +sets +eat +alternative +##ps +bag +resulting +creating +surprised +cemetery +1919 +drop +finding +sarah +cricket +streets +tradition +ride +1933 +exhibition +target +ear +explained +rain +composer +injury +apartment +municipal +educational +occupied +netherlands +clean +billion +constitution +learn +1914 +maximum +classical +francis +lose +opposition +jose +ontario +bear +core +hills +rolled +ending +drawn +permanent +fun +##tes +##lla +lewis +sites +chamber +ryan +##way +scoring +height +1934 +##house +lyrics +staring +55 +officials +1917 +snow +oldest +##tic +orange +##ger +qualified +interior +apparently +succeeded +thousand +dinner +lights +existence +fans +heavily +41 +greatest +conservative +send +bowl +plus +enter +catch +##un +economy +duty +1929 +speech +authorities +princess +performances +versions +shall +graduate +pictures +effective +remembered +poetry +desk +crossed +starring +starts +passenger +sharp +##ant +acres +ass +weather +falling +rank +fund +supporting +check +adult +publishing +heads +cm +southeast +lane +##burg +application +bc +##ura +les +condition +transfer +prevent +display +ex +regions +earl +federation +cool +relatively +answered +besides +1928 +obtained +portion +##town +mix +##ding +reaction +liked +dean +express +peak +1932 +##tte +counter +religion +chain +rare +miller +convention +aid +lie +vehicles +mobile +perform +squad +wonder +lying +crazy +sword +##ping +attempted +centuries +weren +philosophy +category +##ize +anna +interested +47 +sweden +wolf +frequently +abandoned +kg +literary +alliance +task +entitled +##ay +threw +promotion +factory +tiny +soccer +visited +matt +fm +achieved +52 +defence +internal +persian +43 +methods +##ging +arrested +otherwise +cambridge +programming +villages +elementary +districts +rooms +criminal +conflict +worry +trained +1931 +attempts +waited +signal +bird +truck +subsequent +programme +##ol +ad +49 +communist +details +faith +sector 
+patrick +carrying +laugh +##ss +controlled +korean +showing +origin +fuel +evil +1927 +##ent +brief +identity +darkness +address +pool +missed +publication +web +planet +ian +anne +wings +invited +##tt +briefly +standards +kissed +##be +ideas +climate +causing +walter +worse +albert +articles +winners +desire +aged +northeast +dangerous +gate +doubt +1922 +wooden +multi +##ky +poet +rising +funding +46 +communications +communication +violence +copies +prepared +ford +investigation +skills +1924 +pulling +electronic +##ak +##ial +##han +containing +ultimately +offices +singing +understanding +restaurant +tomorrow +fashion +christ +ward +da +pope +stands +5th +flow +studios +aired +commissioned +contained +exist +fresh +americans +##per +wrestling +approved +kid +employed +respect +suit +1925 +angel +asking +increasing +frame +angry +selling +1950s +thin +finds +##nd +temperature +statement +ali +explain +inhabitants +towns +extensive +narrow +51 +jane +flowers +images +promise +somewhere +object +fly +closely +##ls +1912 +bureau +cape +1926 +weekly +presidential +legislative +1921 +##ai +##au +launch +founding +##ny +978 +##ring +artillery +strike +un +institutions +roll +writers +landing +chose +kevin +anymore +pp +##ut +attorney +fit +dan +billboard +receiving +agricultural +breaking +sought +dave +admitted +lands +mexican +##bury +charlie +specifically +hole +iv +howard +credit +moscow +roads +accident +1923 +proved +wear +struck +hey +guards +stuff +slid +expansion +1915 +cat +anthony +##kin +melbourne +opposed +sub +southwest +architect +failure +plane +1916 +##ron +map +camera +tank +listen +regarding +wet +introduction +metropolitan +link +ep +fighter +inch +grown +gene +anger +fixed +buy +dvd +khan +domestic +worldwide +chapel +mill +functions +examples +##head +developing +1910 +turkey +hits +pocket +antonio +papers +grow +unless +circuit +18th +concerned +attached +journalist +selection +journey +converted +provincial +painted +hearing +aren +bands +negative +aside +wondered +knight +lap +survey +ma +##ow +noise +billy +##ium +shooting +guide +bedroom +priest +resistance +motor +homes +sounded +giant +##mer +150 +scenes +equal +comic +patients +hidden +solid +actual +bringing +afternoon +touched +funds +wedding +consisted +marie +canal +sr +kim +treaty +turkish +recognition +residence +cathedral +broad +knees +incident +shaped +fired +norwegian +handle +cheek +contest +represent +##pe +representing +beauty +##sen +birds +advantage +emergency +wrapped +drawing +notice +pink +broadcasting +##ong +somehow +bachelor +seventh +collected +registered +establishment +alan +assumed +chemical +personnel +roger +retirement +jeff +portuguese +wore +tied +device +threat +progress +advance +##ised +banks +hired +manchester +nfl +teachers +structures +forever +##bo +tennis +helping +saturday +sale +applications +junction +hip +incorporated +neighborhood +dressed +ceremony +##ds +influenced +hers +visual +stairs +decades +inner +kansas +hung +hoped +gain +scheduled +downtown +engaged +austria +clock +norway +certainly +pale +protected +1913 +victor +employees +plate +putting +surrounded +##ists +finishing +blues +tropical +##ries +minnesota +consider +philippines +accept +54 +retrieved +1900 +concern +anderson +properties +institution +gordon +successfully +vietnam +##dy +backing +outstanding +muslim +crossing +folk +producing +usual +demand +occurs +observed +lawyer +educated +##ana +kelly +string +pleasure +budget +items +quietly +colorado +philip +typical +##worth +derived +600 +survived 
+asks +mental +##ide +56 +jake +jews +distinguished +ltd +1911 +sri +extremely +53 +athletic +loud +thousands +worried +shadow +transportation +horses +weapon +arena +importance +users +tim +objects +contributed +dragon +douglas +aware +senator +johnny +jordan +sisters +engines +flag +investment +samuel +shock +capable +clark +row +wheel +refers +session +familiar +biggest +wins +hate +maintained +drove +hamilton +request +expressed +injured +underground +churches +walker +wars +tunnel +passes +stupid +agriculture +softly +cabinet +regarded +joining +indiana +##ea +##ms +push +dates +spend +behavior +woods +protein +gently +chase +morgan +mention +burning +wake +combination +occur +mirror +leads +jimmy +indeed +impossible +singapore +paintings +covering +##nes +soldier +locations +attendance +sell +historian +wisconsin +invasion +argued +painter +diego +changing +egypt +##don +experienced +inches +##ku +missouri +vol +grounds +spoken +switzerland +##gan +reform +rolling +ha +forget +massive +resigned +burned +allen +tennessee +locked +values +improved +##mo +wounded +universe +sick +dating +facing +pack +purchase +user +##pur +moments +##ul +merged +anniversary +1908 +coal +brick +understood +causes +dynasty +queensland +establish +stores +crisis +promote +hoping +views +cards +referee +extension +##si +raise +arizona +improve +colonial +formal +charged +##rt +palm +lucky +hide +rescue +faces +95 +feelings +candidates +juan +##ell +goods +6th +courses +weekend +59 +luke +cash +fallen +##om +delivered +affected +installed +carefully +tries +swiss +hollywood +costs +lincoln +responsibility +##he +shore +file +proper +normally +maryland +assistance +jump +constant +offering +friendly +waters +persons +realize +contain +trophy +800 +partnership +factor +58 +musicians +cry +bound +oregon +indicated +hero +houston +medium +##ure +consisting +somewhat +##ara +57 +cycle +##che +beer +moore +frederick +gotten +eleven +worst +weak +approached +arranged +chin +loan +universal +bond +fifteen +pattern +disappeared +##ney +translated +##zed +lip +arab +capture +interests +insurance +##chi +shifted +cave +prix +warning +sections +courts +coat +plot +smell +feed +golf +favorite +maintain +knife +vs +voted +degrees +finance +quebec +opinion +translation +manner +ruled +operate +productions +choose +musician +discovery +confused +tired +separated +stream +techniques +committed +attend +ranking +kings +throw +passengers +measure +horror +fan +mining +sand +danger +salt +calm +decade +dam +require +runner +##ik +rush +associate +greece +##ker +rivers +consecutive +matthew +##ski +sighed +sq +documents +steam +edited +closing +tie +accused +1905 +##ini +islamic +distributed +directors +organisation +bruce +7th +breathing +mad +lit +arrival +concrete +taste +08 +composition +shaking +faster +amateur +adjacent +stating +1906 +twin +flew +##ran +tokyo +publications +##tone +obviously +ridge +storage +1907 +carl +pages +concluded +desert +driven +universities +ages +terminal +sequence +borough +250 +constituency +creative +cousin +economics +dreams +margaret +notably +reduce +montreal +mode +17th +ears +saved +jan +vocal +##ica +1909 +andy +##jo +riding +roughly +threatened +##ise +meters +meanwhile +landed +compete +repeated +grass +czech +regularly +charges +tea +sudden +appeal +##ung +solution +describes +pierre +classification +glad +parking +##ning +belt +physics +99 +rachel +add +hungarian +participate +expedition +damaged +gift +childhood +85 +fifty +##red +mathematics +jumped +letting +defensive +mph +##ux 
+##gh +testing +##hip +hundreds +shoot +owners +matters +smoke +israeli +kentucky +dancing +mounted +grandfather +emma +designs +profit +argentina +##gs +truly +li +lawrence +cole +begun +detroit +willing +branches +smiling +decide +miami +enjoyed +recordings +##dale +poverty +ethnic +gay +##bi +gary +arabic +09 +accompanied +##one +##ons +fishing +determine +residential +acid +##ary +alice +returns +starred +mail +##ang +jonathan +strategy +##ue +net +forty +cook +businesses +equivalent +commonwealth +distinct +ill +##cy +seriously +##ors +##ped +shift +harris +replace +rio +imagine +formula +ensure +##ber +additionally +scheme +conservation +occasionally +purposes +feels +favor +##and +##ore +1930s +contrast +hanging +hunt +movies +1904 +instruments +victims +danish +christopher +busy +demon +sugar +earliest +colony +studying +balance +duties +##ks +belgium +slipped +carter +05 +visible +stages +iraq +fifa +##im +commune +forming +zero +07 +continuing +talked +counties +legend +bathroom +option +tail +clay +daughters +afterwards +severe +jaw +visitors +##ded +devices +aviation +russell +kate +##vi +entering +subjects +##ino +temporary +swimming +forth +smooth +ghost +audio +bush +operates +rocks +movements +signs +eddie +##tz +ann +voices +honorary +06 +memories +dallas +pure +measures +racial +promised +66 +harvard +ceo +16th +parliamentary +indicate +benefit +flesh +dublin +louisiana +1902 +1901 +patient +sleeping +1903 +membership +coastal +medieval +wanting +element +scholars +rice +62 +limit +survive +makeup +rating +definitely +collaboration +obvious +##tan +boss +ms +baron +birthday +linked +soil +diocese +##lan +ncaa +##mann +offensive +shell +shouldn +waist +##tus +plain +ross +organ +resolution +manufacturing +adding +relative +kennedy +98 +whilst +moth +marketing +gardens +crash +72 +heading +partners +credited +carlos +moves +cable +##zi +marshall +##out +depending +bottle +represents +rejected +responded +existed +04 +jobs +denmark +lock +##ating +treated +graham +routes +talent +commissioner +drugs +secure +tests +reign +restored +photography +##gi +contributions +oklahoma +designer +disc +grin +seattle +robin +paused +atlanta +unusual +##gate +praised +las +laughing +satellite +hungary +visiting +##sky +interesting +factors +deck +poems +norman +##water +stuck +speaker +rifle +domain +premiered +##her +dc +comics +actors +01 +reputation +eliminated +8th +ceiling +prisoners +script +##nce +leather +austin +mississippi +rapidly +admiral +parallel +charlotte +guilty +tools +gender +divisions +fruit +##bs +laboratory +nelson +fantasy +marry +rapid +aunt +tribe +requirements +aspects +suicide +amongst +adams +bone +ukraine +abc +kick +sees +edinburgh +clothing +column +rough +gods +hunting +broadway +gathered +concerns +##ek +spending +ty +12th +snapped +requires +solar +bones +cavalry +##tta +iowa +drinking +waste +index +franklin +charity +thompson +stewart +tip +flash +landscape +friday +enjoy +singh +poem +listening +##back +eighth +fred +differences +adapted +bomb +ukrainian +surgery +corporate +masters +anywhere +##more +waves +odd +sean +portugal +orleans +dick +debate +kent +eating +puerto +cleared +96 +expect +cinema +97 +guitarist +blocks +electrical +agree +involving +depth +dying +panel +struggle +##ged +peninsula +adults +novels +emerged +vienna +metro +debuted +shoes +tamil +songwriter +meets +prove +beating +instance +heaven +scared +sending +marks +artistic +passage +superior +03 +significantly +shopping +##tive +retained +##izing +malaysia +technique +cheeks 
+##ola +warren +maintenance +destroy +extreme +allied +120 +appearing +##yn +fill +advice +alabama +qualifying +policies +cleveland +hat +battery +smart +authors +10th +soundtrack +acted +dated +lb +glance +equipped +coalition +funny +outer +ambassador +roy +possibility +couples +campbell +dna +loose +ethan +supplies +1898 +gonna +88 +monster +##res +shake +agents +frequency +springs +dogs +practices +61 +gang +plastic +easier +suggests +gulf +blade +exposed +colors +industries +markets +pan +nervous +electoral +charts +legislation +ownership +##idae +mac +appointment +shield +copy +assault +socialist +abbey +monument +license +throne +employment +jay +93 +replacement +charter +cloud +powered +suffering +accounts +oak +connecticut +strongly +wright +colour +crystal +13th +context +welsh +networks +voiced +gabriel +jerry +##cing +forehead +mp +##ens +manage +schedule +totally +remix +##ii +forests +occupation +print +nicholas +brazilian +strategic +vampires +engineers +76 +roots +seek +correct +instrumental +und +alfred +backed +hop +##des +stanley +robinson +traveled +wayne +welcome +austrian +achieve +67 +exit +rates +1899 +strip +whereas +##cs +sing +deeply +adventure +bobby +rick +jamie +careful +components +cap +useful +personality +knee +##shi +pushing +hosts +02 +protest +ca +ottoman +symphony +##sis +63 +boundary +1890 +processes +considering +considerable +tons +##work +##ft +##nia +cooper +trading +dear +conduct +91 +illegal +apple +revolutionary +holiday +definition +harder +##van +jacob +circumstances +destruction +##lle +popularity +grip +classified +liverpool +donald +baltimore +flows +seeking +honour +approval +92 +mechanical +till +happening +statue +critic +increasingly +immediate +describe +commerce +stare +##ster +indonesia +meat +rounds +boats +baker +orthodox +depression +formally +worn +naked +claire +muttered +sentence +11th +emily +document +77 +criticism +wished +vessel +spiritual +bent +virgin +parker +minimum +murray +lunch +danny +printed +compilation +keyboards +false +blow +belonged +68 +raising +78 +cutting +##board +pittsburgh +##up +9th +shadows +81 +hated +indigenous +jon +15th +barry +scholar +ah +##zer +oliver +##gy +stick +susan +meetings +attracted +spell +romantic +##ver +ye +1895 +photo +demanded +customers +##ac +1896 +logan +revival +keys +modified +commanded +jeans +##ious +upset +raw +phil +detective +hiding +resident +vincent +##bly +experiences +diamond +defeating +coverage +lucas +external +parks +franchise +helen +bible +successor +percussion +celebrated +il +lift +profile +clan +romania +##ied +mills +##su +nobody +achievement +shrugged +fault +1897 +rhythm +initiative +breakfast +carbon +700 +69 +lasted +violent +74 +wound +ken +killer +gradually +filmed +°c +dollars +processing +94 +remove +criticized +guests +sang +chemistry +##vin +legislature +disney +##bridge +uniform +escaped +integrated +proposal +purple +denied +liquid +karl +influential +morris +nights +stones +intense +experimental +twisted +71 +84 +##ld +pace +nazi +mitchell +ny +blind +reporter +newspapers +14th +centers +burn +basin +forgotten +surviving +filed +collections +monastery +losses +manual +couch +description +appropriate +merely +tag +missions +sebastian +restoration +replacing +triple +73 +elder +julia +warriors +benjamin +julian +convinced +stronger +amazing +declined +versus +merchant +happens +output +finland +bare +barbara +absence +ignored +dawn +injuries +##port +producers +##ram +82 +luis +##ities +kw +admit +expensive +electricity +nba +exception +symbol 
+##ving +ladies +shower +sheriff +characteristics +##je +aimed +button +ratio +effectively +summit +angle +jury +bears +foster +vessels +pants +executed +evans +dozen +advertising +kicked +patrol +1889 +competitions +lifetime +principles +athletics +##logy +birmingham +sponsored +89 +rob +nomination +1893 +acoustic +##sm +creature +longest +##tra +credits +harbor +dust +josh +##so +territories +milk +infrastructure +completion +thailand +indians +leon +archbishop +##sy +assist +pitch +blake +arrangement +girlfriend +serbian +operational +hence +sad +scent +fur +dj +sessions +hp +refer +rarely +##ora +exists +1892 +##ten +scientists +dirty +penalty +burst +portrait +seed +79 +pole +limits +rival +1894 +stable +alpha +grave +constitutional +alcohol +arrest +flower +mystery +devil +architectural +relationships +greatly +habitat +##istic +larry +progressive +remote +cotton +##ics +##ok +preserved +reaches +##ming +cited +86 +vast +scholarship +decisions +cbs +joy +teach +1885 +editions +knocked +eve +searching +partly +participation +gap +animated +fate +excellent +##ett +na +87 +alternate +saints +youngest +##ily +climbed +##ita +##tors +suggest +##ct +discussion +staying +choir +lakes +jacket +revenue +nevertheless +peaked +instrument +wondering +annually +managing +neil +1891 +signing +terry +##ice +apply +clinical +brooklyn +aim +catherine +fuck +farmers +figured +ninth +pride +hugh +evolution +ordinary +involvement +comfortable +shouted +tech +encouraged +taiwan +representation +sharing +##lia +##em +panic +exact +cargo +competing +fat +cried +83 +1920s +occasions +pa +cabin +borders +utah +marcus +##isation +badly +muscles +##ance +victorian +transition +warner +bet +permission +##rin +slave +terrible +similarly +shares +seth +uefa +possession +medals +benefits +colleges +lowered +perfectly +mall +transit +##ye +##kar +publisher +##ened +harrison +deaths +elevation +##ae +asleep +machines +sigh +ash +hardly +argument +occasion +parent +leo +decline +1888 +contribution +##ua +concentration +1000 +opportunities +hispanic +guardian +extent +emotions +hips +mason +volumes +bloody +controversy +diameter +steady +mistake +phoenix +identify +violin +##sk +departure +richmond +spin +funeral +enemies +1864 +gear +literally +connor +random +sergeant +grab +confusion +1865 +transmission +informed +op +leaning +sacred +suspended +thinks +gates +portland +luck +agencies +yours +hull +expert +muscle +layer +practical +sculpture +jerusalem +latest +lloyd +statistics +deeper +recommended +warrior +arkansas +mess +supports +greg +eagle +1880 +recovered +rated +concerts +rushed +##ano +stops +eggs +files +premiere +keith +##vo +delhi +turner +pit +affair +belief +paint +##zing +mate +##ach +##ev +victim +##ology +withdrew +bonus +styles +fled +##ud +glasgow +technologies +funded +nbc +adaptation +##ata +portrayed +cooperation +supporters +judges +bernard +justin +hallway +ralph +##ick +graduating +controversial +distant +continental +spider +bite +##ho +recognize +intention +mixing +##ese +egyptian +bow +tourism +suppose +claiming +tiger +dominated +participants +vi +##ru +nurse +partially +tape +##rum +psychology +##rn +essential +touring +duo +voting +civilian +emotional +channels +##king +apparent +hebrew +1887 +tommy +carrier +intersection +beast +hudson +##gar +##zo +lab +nova +bench +discuss +costa +##ered +detailed +behalf +drivers +unfortunately +obtain +##lis +rocky +##dae +siege +friendship +honey +##rian +1861 +amy +hang +posted +governments +collins +respond +wildlife +preferred +operator 
+##po +laura +pregnant +videos +dennis +suspected +boots +instantly +weird +automatic +businessman +alleged +placing +throwing +ph +mood +1862 +perry +venue +jet +remainder +##lli +##ci +passion +biological +boyfriend +1863 +dirt +buffalo +ron +segment +fa +abuse +##era +genre +thrown +stroke +colored +stress +exercise +displayed +##gen +struggled +##tti +abroad +dramatic +wonderful +thereafter +madrid +component +widespread +##sed +tale +citizen +todd +monday +1886 +vancouver +overseas +forcing +crying +descent +##ris +discussed +substantial +ranks +regime +1870 +provinces +switch +drum +zane +ted +tribes +proof +lp +cream +researchers +volunteer +manor +silk +milan +donated +allies +venture +principle +delivery +enterprise +##ves +##ans +bars +traditionally +witch +reminded +copper +##uk +pete +inter +links +colin +grinned +elsewhere +competitive +frequent +##oy +scream +##hu +tension +texts +submarine +finnish +defending +defend +pat +detail +1884 +affiliated +stuart +themes +villa +periods +tool +belgian +ruling +crimes +answers +folded +licensed +resort +demolished +hans +lucy +1881 +lion +traded +photographs +writes +craig +##fa +trials +generated +beth +noble +debt +percentage +yorkshire +erected +ss +viewed +grades +confidence +ceased +islam +telephone +retail +##ible +chile +m² +roberts +sixteen +##ich +commented +hampshire +innocent +dual +pounds +checked +regulations +afghanistan +sung +rico +liberty +assets +bigger +options +angels +relegated +tribute +wells +attending +leaf +##yan +butler +romanian +forum +monthly +lisa +patterns +gmina +##tory +madison +hurricane +rev +##ians +bristol +##ula +elite +valuable +disaster +democracy +awareness +germans +freyja +##ins +loop +absolutely +paying +populations +maine +sole +prayer +spencer +releases +doorway +bull +##ani +lover +midnight +conclusion +##sson +thirteen +lily +mediterranean +##lt +nhl +proud +sample +##hill +drummer +guinea +##ova +murphy +climb +##ston +instant +attributed +horn +ain +railways +steven +##ao +autumn +ferry +opponent +root +traveling +secured +corridor +stretched +tales +sheet +trinity +cattle +helps +indicates +manhattan +murdered +fitted +1882 +gentle +grandmother +mines +shocked +vegas +produces +##light +caribbean +##ou +belong +continuous +desperate +drunk +historically +trio +waved +raf +dealing +nathan +bat +murmured +interrupted +residing +scientist +pioneer +harold +aaron +##net +delta +attempting +minority +mini +believes +chorus +tend +lots +eyed +indoor +load +shots +updated +jail +##llo +concerning +connecting +wealth +##ved +slaves +arrive +rangers +sufficient +rebuilt +##wick +cardinal +flood +muhammad +whenever +relation +runners +moral +repair +viewers +arriving +revenge +punk +assisted +bath +fairly +breathe +lists +innings +illustrated +whisper +nearest +voters +clinton +ties +ultimate +screamed +beijing +lions +andre +fictional +gathering +comfort +radar +suitable +dismissed +hms +ban +pine +wrist +atmosphere +voivodeship +bid +timber +##ned +##nan +giants +##ane +cameron +recovery +uss +identical +categories +switched +serbia +laughter +noah +ensemble +therapy +peoples +touching +##off +locally +pearl +platforms +everywhere +ballet +tables +lanka +herbert +outdoor +toured +derek +1883 +spaces +contested +swept +1878 +exclusive +slight +connections +##dra +winds +prisoner +collective +bangladesh +tube +publicly +wealthy +thai +##ys +isolated +select +##ric +insisted +pen +fortune +ticket +spotted +reportedly +animation +enforcement +tanks +110 +decides +wider +lowest +owen +##time +nod 
+hitting +##hn +gregory +furthermore +magazines +fighters +solutions +##ery +pointing +requested +peru +reed +chancellor +knights +mask +worker +eldest +flames +reduction +1860 +volunteers +##tis +reporting +##hl +wire +advisory +endemic +origins +settlers +pursue +knock +consumer +1876 +eu +compound +creatures +mansion +sentenced +ivan +deployed +guitars +frowned +involves +mechanism +kilometers +perspective +shops +maps +terminus +duncan +alien +fist +bridges +##pers +heroes +fed +derby +swallowed +##ros +patent +sara +illness +characterized +adventures +slide +hawaii +jurisdiction +##op +organised +##side +adelaide +walks +biology +se +##ties +rogers +swing +tightly +boundaries +##rie +prepare +implementation +stolen +##sha +certified +colombia +edwards +garage +##mm +recalled +##ball +rage +harm +nigeria +breast +##ren +furniture +pupils +settle +##lus +cuba +balls +client +alaska +21st +linear +thrust +celebration +latino +genetic +terror +##cia +##ening +lightning +fee +witness +lodge +establishing +skull +##ique +earning +hood +##ei +rebellion +wang +sporting +warned +missile +devoted +activist +porch +worship +fourteen +package +1871 +decorated +##shire +housed +##ock +chess +sailed +doctors +oscar +joan +treat +garcia +harbour +jeremy +##ire +traditions +dominant +jacques +##gon +##wan +relocated +1879 +amendment +sized +companion +simultaneously +volleyball +spun +acre +increases +stopping +loves +belongs +affect +drafted +tossed +scout +battles +1875 +filming +shoved +munich +tenure +vertical +romance +pc +##cher +argue +##ical +craft +ranging +www +opens +honest +tyler +yesterday +virtual +##let +muslims +reveal +snake +immigrants +radical +screaming +speakers +firing +saving +belonging +ease +lighting +prefecture +blame +farmer +hungry +grows +rubbed +beam +sur +subsidiary +##cha +armenian +sao +dropping +conventional +##fer +microsoft +reply +qualify +spots +1867 +sweat +festivals +##ken +immigration +physician +discover +exposure +sandy +explanation +isaac +implemented +##fish +hart +initiated +connect +stakes +presents +heights +householder +pleased +tourist +regardless +slip +closest +##ction +surely +sultan +brings +riley +preparation +aboard +slammed +baptist +experiment +ongoing +interstate +organic +playoffs +##ika +1877 +130 +##tar +hindu +error +tours +tier +plenty +arrangements +talks +trapped +excited +sank +ho +athens +1872 +denver +welfare +suburb +athletes +trick +diverse +belly +exclusively +yelled +1868 +##med +conversion +##ette +1874 +internationally +computers +conductor +abilities +sensitive +hello +dispute +measured +globe +rocket +prices +amsterdam +flights +tigers +inn +municipalities +emotion +references +3d +##mus +explains +airlines +manufactured +pm +archaeological +1873 +interpretation +devon +comment +##ites +settlements +kissing +absolute +improvement +suite +impressed +barcelona +sullivan +jefferson +towers +jesse +julie +##tin +##lu +grandson +hi +gauge +regard +rings +interviews +trace +raymond +thumb +departments +burns +serial +bulgarian +scores +demonstrated +##ix +1866 +kyle +alberta +underneath +romanized +##ward +relieved +acquisition +phrase +cliff +reveals +han +cuts +merger +custom +##dar +nee +gilbert +graduation +##nts +assessment +cafe +difficulty +demands +swung +democrat +jennifer +commons +1940s +grove +##yo +completing +focuses +sum +substitute +bearing +stretch +reception +##py +reflected +essentially +destination +pairs +##ched +survival +resource +##bach +promoting +doubles +messages +tear +##down +##fully +parade +florence 
+harvey +incumbent +partial +framework +900 +pedro +frozen +procedure +olivia +controls +##mic +shelter +personally +temperatures +##od +brisbane +tested +sits +marble +comprehensive +oxygen +leonard +##kov +inaugural +iranian +referring +quarters +attitude +##ivity +mainstream +lined +mars +dakota +norfolk +unsuccessful +##° +explosion +helicopter +congressional +##sing +inspector +bitch +seal +departed +divine +##ters +coaching +examination +punishment +manufacturer +sink +columns +unincorporated +signals +nevada +squeezed +dylan +dining +photos +martial +manuel +eighteen +elevator +brushed +plates +ministers +ivy +congregation +##len +slept +specialized +taxes +curve +restricted +negotiations +likes +statistical +arnold +inspiration +execution +bold +intermediate +significance +margin +ruler +wheels +gothic +intellectual +dependent +listened +eligible +buses +widow +syria +earn +cincinnati +collapsed +recipient +secrets +accessible +philippine +maritime +goddess +clerk +surrender +breaks +playoff +database +##ified +##lon +ideal +beetle +aspect +soap +regulation +strings +expand +anglo +shorter +crosses +retreat +tough +coins +wallace +directions +pressing +##oon +shipping +locomotives +comparison +topics +nephew +##mes +distinction +honors +travelled +sierra +ibn +##over +fortress +sa +recognised +carved +1869 +clients +##dan +intent +##mar +coaches +describing +bread +##ington +beaten +northwestern +##ona +merit +youtube +collapse +challenges +em +historians +objective +submitted +virus +attacking +drake +assume +##ere +diseases +marc +stem +leeds +##cus +##ab +farming +glasses +##lock +visits +nowhere +fellowship +relevant +carries +restaurants +experiments +101 +constantly +bases +targets +shah +tenth +opponents +verse +territorial +##ira +writings +corruption +##hs +instruction +inherited +reverse +emphasis +##vic +employee +arch +keeps +rabbi +watson +payment +uh +##ala +nancy +##tre +venice +fastest +sexy +banned +adrian +properly +ruth +touchdown +dollar +boards +metre +circles +edges +favour +comments +ok +travels +liberation +scattered +firmly +##ular +holland +permitted +diesel +kenya +den +originated +##ral +demons +resumed +dragged +rider +##rus +servant +blinked +extend +torn +##ias +##sey +input +meal +everybody +cylinder +kinds +camps +##fe +bullet +logic +##wn +croatian +evolved +healthy +fool +chocolate +wise +preserve +pradesh +##ess +respective +1850 +##ew +chicken +artificial +gross +corresponding +convicted +cage +caroline +dialogue +##dor +narrative +stranger +mario +br +christianity +failing +trent +commanding +buddhist +1848 +maurice +focusing +yale +bike +altitude +##ering +mouse +revised +##sley +veteran +##ig +pulls +theology +crashed +campaigns +legion +##ability +drag +excellence +customer +cancelled +intensity +excuse +##lar +liga +participating +contributing +printing +##burn +variable +##rk +curious +bin +legacy +renaissance +##my +symptoms +binding +vocalist +dancer +##nie +grammar +gospel +democrats +ya +enters +sc +diplomatic +hitler +##ser +clouds +mathematical +quit +defended +oriented +##heim +fundamental +hardware +impressive +equally +convince +confederate +guilt +chuck +sliding +##ware +magnetic +narrowed +petersburg +bulgaria +otto +phd +skill +##ama +reader +hopes +pitcher +reservoir +hearts +automatically +expecting +mysterious +bennett +extensively +imagined +seeds +monitor +fix +##ative +journalism +struggling +signature +ranch +encounter +photographer +observation +protests +##pin +influences +##hr +calendar +##all +cruz +croatia 
+locomotive +hughes +naturally +shakespeare +basement +hook +uncredited +faded +theories +approaches +dare +phillips +filling +fury +obama +##ain +efficient +arc +deliver +min +raid +breeding +inducted +leagues +efficiency +axis +montana +eagles +##ked +supplied +instructions +karen +picking +indicating +trap +anchor +practically +christians +tomb +vary +occasional +electronics +lords +readers +newcastle +faint +innovation +collect +situations +engagement +160 +claude +mixture +##feld +peer +tissue +logo +lean +##ration +°f +floors +##ven +architects +reducing +##our +##ments +rope +1859 +ottawa +##har +samples +banking +declaration +proteins +resignation +francois +saudi +advocate +exhibited +armor +twins +divorce +##ras +abraham +reviewed +jo +temporarily +matrix +physically +pulse +curled +##ena +difficulties +bengal +usage +##ban +annie +riders +certificate +##pi +holes +warsaw +distinctive +jessica +##mon +mutual +1857 +customs +circular +eugene +removal +loaded +mere +vulnerable +depicted +generations +dame +heir +enormous +lightly +climbing +pitched +lessons +pilots +nepal +ram +google +preparing +brad +louise +renowned +##₂ +liam +##ably +plaza +shaw +sophie +brilliant +bills +##bar +##nik +fucking +mainland +server +pleasant +seized +veterans +jerked +fail +beta +brush +radiation +stored +warmth +southeastern +nate +sin +raced +berkeley +joke +athlete +designation +trunk +##low +roland +qualification +archives +heels +artwork +receives +judicial +reserves +##bed +woke +installation +abu +floating +fake +lesser +excitement +interface +concentrated +addressed +characteristic +amanda +saxophone +monk +auto +##bus +releasing +egg +dies +interaction +defender +ce +outbreak +glory +loving +##bert +sequel +consciousness +http +awake +ski +enrolled +##ress +handling +rookie +brow +somebody +biography +warfare +amounts +contracts +presentation +fabric +dissolved +challenged +meter +psychological +lt +elevated +rally +accurate +##tha +hospitals +undergraduate +specialist +venezuela +exhibit +shed +nursing +protestant +fluid +structural +footage +jared +consistent +prey +##ska +succession +reflect +exile +lebanon +wiped +suspect +shanghai +resting +integration +preservation +marvel +variant +pirates +sheep +rounded +capita +sailing +colonies +manuscript +deemed +variations +clarke +functional +emerging +boxing +relaxed +curse +azerbaijan +heavyweight +nickname +editorial +rang +grid +tightened +earthquake +flashed +miguel +rushing +##ches +improvements +boxes +brooks +180 +consumption +molecular +felix +societies +repeatedly +variation +aids +civic +graphics +professionals +realm +autonomous +receiver +delayed +workshop +militia +chairs +trump +canyon +##point +harsh +extending +lovely +happiness +##jan +stake +eyebrows +embassy +wellington +hannah +##ella +sony +corners +bishops +swear +cloth +contents +xi +namely +commenced +1854 +stanford +nashville +courage +graphic +commitment +garrison +##bin +hamlet +clearing +rebels +attraction +literacy +cooking +ruins +temples +jenny +humanity +celebrate +hasn +freight +sixty +rebel +bastard +##art +newton +##ada +deer +##ges +##ching +smiles +delaware +singers +##ets +approaching +assists +flame +##ph +boulevard +barrel +planted +##ome +pursuit +##sia +consequences +posts +shallow +invitation +rode +depot +ernest +kane +rod +concepts +preston +topic +chambers +striking +blast +arrives +descendants +montgomery +ranges +worlds +##lay +##ari +span +chaos +praise +##ag +fewer +1855 +sanctuary +mud +fbi +##ions +programmes +maintaining +unity +harper 
+bore +handsome +closure +tournaments +thunder +nebraska +linda +facade +puts +satisfied +argentine +dale +cork +dome +panama +##yl +1858 +tasks +experts +##ates +feeding +equation +##las +##ida +##tu +engage +bryan +##ax +um +quartet +melody +disbanded +sheffield +blocked +gasped +delay +kisses +maggie +connects +##non +sts +poured +creator +publishers +##we +guided +ellis +extinct +hug +gaining +##ord +complicated +##bility +poll +clenched +investigate +##use +thereby +quantum +spine +cdp +humor +kills +administered +semifinals +##du +encountered +ignore +##bu +commentary +##maker +bother +roosevelt +140 +plains +halfway +flowing +cultures +crack +imprisoned +neighboring +airline +##ses +##view +##mate +##ec +gather +wolves +marathon +transformed +##ill +cruise +organisations +carol +punch +exhibitions +numbered +alarm +ratings +daddy +silently +##stein +queens +colours +impression +guidance +liu +tactical +##rat +marshal +della +arrow +##ings +rested +feared +tender +owns +bitter +advisor +escort +##ides +spare +farms +grants +##ene +dragons +encourage +colleagues +cameras +##und +sucked +pile +spirits +prague +statements +suspension +landmark +fence +torture +recreation +bags +permanently +survivors +pond +spy +predecessor +bombing +coup +##og +protecting +transformation +glow +##lands +##book +dug +priests +andrea +feat +barn +jumping +##chen +##ologist +##con +casualties +stern +auckland +pipe +serie +revealing +ba +##bel +trevor +mercy +spectrum +yang +consist +governing +collaborated +possessed +epic +comprises +blew +shane +##ack +lopez +honored +magical +sacrifice +judgment +perceived +hammer +mtv +baronet +tune +das +missionary +sheets +350 +neutral +oral +threatening +attractive +shade +aims +seminary +##master +estates +1856 +michel +wounds +refugees +manufacturers +##nic +mercury +syndrome +porter +##iya +##din +hamburg +identification +upstairs +purse +widened +pause +cared +breathed +affiliate +santiago +prevented +celtic +fisher +125 +recruited +byzantine +reconstruction +farther +##mp +diet +sake +au +spite +sensation +##ert +blank +separation +105 +##hon +vladimir +armies +anime +##lie +accommodate +orbit +cult +sofia +archive +##ify +##box +founders +sustained +disorder +honours +northeastern +mia +crops +violet +threats +blanket +fires +canton +followers +southwestern +prototype +voyage +assignment +altered +moderate +protocol +pistol +##eo +questioned +brass +lifting +1852 +math +authored +##ual +doug +dimensional +dynamic +##san +1851 +pronounced +grateful +quest +uncomfortable +boom +presidency +stevens +relating +politicians +chen +barrier +quinn +diana +mosque +tribal +cheese +palmer +portions +sometime +chester +treasure +wu +bend +download +millions +reforms +registration +##osa +consequently +monitoring +ate +preliminary +brandon +invented +ps +eaten +exterior +intervention +ports +documented +log +displays +lecture +sally +favourite +##itz +vermont +lo +invisible +isle +breed +##ator +journalists +relay +speaks +backward +explore +midfielder +actively +stefan +procedures +cannon +blond +kenneth +centered +servants +chains +libraries +malcolm +essex +henri +slavery +##hal +facts +fairy +coached +cassie +cats +washed +cop +##fi +announcement +item +2000s +vinyl +activated +marco +frontier +growled +curriculum +##das +loyal +accomplished +leslie +ritual +kenny +##00 +vii +napoleon +hollow +hybrid +jungle +stationed +friedrich +counted +##ulated +platinum +theatrical +seated +col +rubber +glen +1840 +diversity +healing +extends +id +provisions +administrator 
+columbus +##oe +tributary +te +assured +org +##uous +prestigious +examined +lectures +grammy +ronald +associations +bailey +allan +essays +flute +believing +consultant +proceedings +travelling +1853 +kit +kerala +yugoslavia +buddy +methodist +##ith +burial +centres +batman +##nda +discontinued +bo +dock +stockholm +lungs +severely +##nk +citing +manga +##ugh +steal +mumbai +iraqi +robot +celebrity +bride +broadcasts +abolished +pot +joel +overhead +franz +packed +reconnaissance +johann +acknowledged +introduce +handled +doctorate +developments +drinks +alley +palestine +##nis +##aki +proceeded +recover +bradley +grain +patch +afford +infection +nationalist +legendary +##ath +interchange +virtually +gen +gravity +exploration +amber +vital +wishes +powell +doctrine +elbow +screenplay +##bird +contribute +indonesian +pet +creates +##com +enzyme +kylie +discipline +drops +manila +hunger +##ien +layers +suffer +fever +bits +monica +keyboard +manages +##hood +searched +appeals +##bad +testament +grande +reid +##war +beliefs +congo +##ification +##dia +si +requiring +##via +casey +1849 +regret +streak +rape +depends +syrian +sprint +pound +tourists +upcoming +pub +##xi +tense +##els +practiced +echo +nationwide +guild +motorcycle +liz +##zar +chiefs +desired +elena +bye +precious +absorbed +relatives +booth +pianist +##mal +citizenship +exhausted +wilhelm +##ceae +##hed +noting +quarterback +urge +hectares +##gue +ace +holly +##tal +blonde +davies +parked +sustainable +stepping +twentieth +airfield +galaxy +nest +chip +##nell +tan +shaft +paulo +requirement +##zy +paradise +tobacco +trans +renewed +vietnamese +##cker +##ju +suggesting +catching +holmes +enjoying +md +trips +colt +holder +butterfly +nerve +reformed +cherry +bowling +trailer +carriage +goodbye +appreciate +toy +joshua +interactive +enabled +involve +##kan +collar +determination +bunch +facebook +recall +shorts +superintendent +episcopal +frustration +giovanni +nineteenth +laser +privately +array +circulation +##ovic +armstrong +deals +painful +permit +discrimination +##wi +aires +retiring +cottage +ni +##sta +horizon +ellen +jamaica +ripped +fernando +chapters +playstation +patron +lecturer +navigation +behaviour +genes +georgian +export +solomon +rivals +swift +seventeen +rodriguez +princeton +independently +sox +1847 +arguing +entity +casting +hank +criteria +oakland +geographic +milwaukee +reflection +expanding +conquest +dubbed +##tv +halt +brave +brunswick +doi +arched +curtis +divorced +predominantly +somerset +streams +ugly +zoo +horrible +curved +buenos +fierce +dictionary +vector +theological +unions +handful +stability +chan +punjab +segments +##lly +altar +ignoring +gesture +monsters +pastor +##stone +thighs +unexpected +operators +abruptly +coin +compiled +associates +improving +migration +pin +##ose +compact +collegiate +reserved +##urs +quarterfinals +roster +restore +assembled +hurry +oval +##cies +1846 +flags +martha +##del +victories +sharply +##rated +argues +deadly +neo +drawings +symbols +performer +##iel +griffin +restrictions +editing +andrews +java +journals +arabia +compositions +dee +pierce +removing +hindi +casino +runway +civilians +minds +nasa +hotels +##zation +refuge +rent +retain +potentially +conferences +suburban +conducting +##tto +##tions +##tle +descended +massacre +##cal +ammunition +terrain +fork +souls +counts +chelsea +durham +drives +cab +##bank +perth +realizing +palestinian +finn +simpson +##dal +betty +##ule +moreover +particles +cardinals +tent +evaluation +extraordinary +##oid 
+inscription +##works +wednesday +chloe +maintains +panels +ashley +trucks +##nation +cluster +sunlight +strikes +zhang +##wing +dialect +canon +##ap +tucked +##ws +collecting +##mas +##can +##sville +maker +quoted +evan +franco +aria +buying +cleaning +eva +closet +provision +apollo +clinic +rat +##ez +necessarily +ac +##gle +##ising +venues +flipped +cent +spreading +trustees +checking +authorized +##sco +disappointed +##ado +notion +duration +trumpet +hesitated +topped +brussels +rolls +theoretical +hint +define +aggressive +repeat +wash +peaceful +optical +width +allegedly +mcdonald +strict +copyright +##illa +investors +mar +jam +witnesses +sounding +miranda +michelle +privacy +hugo +harmony +##pp +valid +lynn +glared +nina +102 +headquartered +diving +boarding +gibson +##ncy +albanian +marsh +routine +dealt +enhanced +er +intelligent +substance +targeted +enlisted +discovers +spinning +observations +pissed +smoking +rebecca +capitol +visa +varied +costume +seemingly +indies +compensation +surgeon +thursday +arsenal +westminster +suburbs +rid +anglican +##ridge +knots +foods +alumni +lighter +fraser +whoever +portal +scandal +##ray +gavin +advised +instructor +flooding +terrorist +##ale +teenage +interim +senses +duck +teen +thesis +abby +eager +overcome +##ile +newport +glenn +rises +shame +##cc +prompted +priority +forgot +bomber +nicolas +protective +360 +cartoon +katherine +breeze +lonely +trusted +henderson +richardson +relax +banner +candy +palms +remarkable +##rio +legends +cricketer +essay +ordained +edmund +rifles +trigger +##uri +##away +sail +alert +1830 +audiences +penn +sussex +siblings +pursued +indianapolis +resist +rosa +consequence +succeed +avoided +1845 +##ulation +inland +##tie +##nna +counsel +profession +chronicle +hurried +##una +eyebrow +eventual +bleeding +innovative +cure +##dom +committees +accounting +con +scope +hardy +heather +tenor +gut +herald +codes +tore +scales +wagon +##oo +luxury +tin +prefer +fountain +triangle +bonds +darling +convoy +dried +traced +beings +troy +accidentally +slam +findings +smelled +joey +lawyers +outcome +steep +bosnia +configuration +shifting +toll +brook +performers +lobby +philosophical +construct +shrine +aggregate +boot +cox +phenomenon +savage +insane +solely +reynolds +lifestyle +##ima +nationally +holdings +consideration +enable +edgar +mo +mama +##tein +fights +relegation +chances +atomic +hub +conjunction +awkward +reactions +currency +finale +kumar +underwent +steering +elaborate +gifts +comprising +melissa +veins +reasonable +sunshine +chi +solve +trails +inhabited +elimination +ethics +huh +ana +molly +consent +apartments +layout +marines +##ces +hunters +bulk +##oma +hometown +##wall +##mont +cracked +reads +neighbouring +withdrawn +admission +wingspan +damned +anthology +lancashire +brands +batting +forgive +cuban +awful +##lyn +104 +dimensions +imagination +##ade +dante +##ship +tracking +desperately +goalkeeper +##yne +groaned +workshops +confident +burton +gerald +milton +circus +uncertain +slope +copenhagen +sophia +fog +philosopher +portraits +accent +cycling +varying +gripped +larvae +garrett +specified +scotia +mature +luther +kurt +rap +##kes +aerial +750 +ferdinand +heated +es +transported +##shan +safely +nonetheless +##orn +##gal +motors +demanding +##sburg +startled +##brook +ally +generate +caps +ghana +stained +demo +mentions +beds +ap +afterward +diary +##bling +utility +##iro +richards +1837 +conspiracy +conscious +shining +footsteps +observer +cyprus +urged +loyalty +developer +probability 
+olive +upgraded +gym +miracle +insects +graves +1844 +ourselves +hydrogen +amazon +katie +tickets +poets +##pm +planes +##pan +prevention +witnessed +dense +jin +randy +tang +warehouse +monroe +bang +archived +elderly +investigations +alec +granite +mineral +conflicts +controlling +aboriginal +carlo +##zu +mechanics +stan +stark +rhode +skirt +est +##berry +bombs +respected +##horn +imposed +limestone +deny +nominee +memphis +grabbing +disabled +##als +amusement +aa +frankfurt +corn +referendum +varies +slowed +disk +firms +unconscious +incredible +clue +sue +##zhou +twist +##cio +joins +idaho +chad +developers +computing +destroyer +103 +mortal +tucker +kingston +choices +yu +carson +1800 +os +whitney +geneva +pretend +dimension +staged +plateau +maya +##une +freestyle +##bc +rovers +hiv +##ids +tristan +classroom +prospect +##hus +honestly +diploma +lied +thermal +auxiliary +feast +unlikely +iata +##tel +morocco +pounding +treasury +lithuania +considerably +1841 +dish +1812 +geological +matching +stumbled +destroying +marched +brien +advances +cake +nicole +belle +settling +measuring +directing +##mie +tuesday +bassist +capabilities +stunned +fraud +torpedo +##list +##phone +anton +wisdom +surveillance +ruined +##ulate +lawsuit +healthcare +theorem +halls +trend +aka +horizontal +dozens +acquire +lasting +swim +hawk +gorgeous +fees +vicinity +decrease +adoption +tactics +##ography +pakistani +##ole +draws +##hall +willie +burke +heath +algorithm +integral +powder +elliott +brigadier +jackie +tate +varieties +darker +##cho +lately +cigarette +specimens +adds +##ree +##ensis +##inger +exploded +finalist +cia +murders +wilderness +arguments +nicknamed +acceptance +onwards +manufacture +robertson +jets +tampa +enterprises +blog +loudly +composers +nominations +1838 +ai +malta +inquiry +automobile +hosting +viii +rays +tilted +grief +museums +strategies +furious +euro +equality +cohen +poison +surrey +wireless +governed +ridiculous +moses +##esh +##room +vanished +##ito +barnes +attract +morrison +istanbul +##iness +absent +rotation +petition +janet +##logical +satisfaction +custody +deliberately +observatory +comedian +surfaces +pinyin +novelist +strictly +canterbury +oslo +monks +embrace +ibm +jealous +photograph +continent +dorothy +marina +doc +excess +holden +allegations +explaining +stack +avoiding +lance +storyline +majesty +poorly +spike +dos +bradford +raven +travis +classics +proven +voltage +pillow +fists +butt +1842 +interpreted +##car +1839 +gage +telegraph +lens +promising +expelled +casual +collector +zones +##min +silly +nintendo +##kh +##bra +downstairs +chef +suspicious +afl +flies +vacant +uganda +pregnancy +condemned +lutheran +estimates +cheap +decree +saxon +proximity +stripped +idiot +deposits +contrary +presenter +magnus +glacier +im +offense +edwin +##ori +upright +##long +bolt +##ois +toss +geographical +##izes +environments +delicate +marking +abstract +xavier +nails +windsor +plantation +occurring +equity +saskatchewan +fears +drifted +sequences +vegetation +revolt +##stic +1843 +sooner +fusion +opposing +nato +skating +1836 +secretly +ruin +lease +##oc +edit +##nne +flora +anxiety +ruby +##ological +##mia +tel +bout +taxi +emmy +frost +rainbow +compounds +foundations +rainfall +assassination +nightmare +dominican +##win +achievements +deserve +orlando +intact +armenia +##nte +calgary +valentine +106 +marion +proclaimed +theodore +bells +courtyard +thigh +gonzalez +console +troop +minimal +monte +everyday +##ence +##if +supporter +terrorism +buck +openly 
+presbyterian +activists +carpet +##iers +rubbing +uprising +##yi +cute +conceived +legally +##cht +millennium +cello +velocity +ji +rescued +cardiff +1835 +rex +concentrate +senators +beard +rendered +glowing +battalions +scouts +competitors +sculptor +catalogue +arctic +ion +raja +bicycle +wow +glancing +lawn +##woman +gentleman +lighthouse +publish +predicted +calculated +##val +variants +##gne +strain +##ui +winston +deceased +##nus +touchdowns +brady +caleb +sinking +echoed +crush +hon +blessed +protagonist +hayes +endangered +magnitude +editors +##tine +estimate +responsibilities +##mel +backup +laying +consumed +sealed +zurich +lovers +frustrated +##eau +ahmed +kicking +mit +treasurer +1832 +biblical +refuse +terrified +pump +agrees +genuine +imprisonment +refuses +plymouth +##hen +lou +##nen +tara +trembling +antarctic +ton +learns +##tas +crap +crucial +faction +atop +##borough +wrap +lancaster +odds +hopkins +erik +lyon +##eon +bros +##ode +snap +locality +tips +empress +crowned +cal +acclaimed +chuckled +##ory +clara +sends +mild +towel +##fl +##day +##а +wishing +assuming +interviewed +##bal +##die +interactions +eden +cups +helena +##lf +indie +beck +##fire +batteries +filipino +wizard +parted +##lam +traces +##born +rows +idol +albany +delegates +##ees +##sar +discussions +##ex +notre +instructed +belgrade +highways +suggestion +lauren +possess +orientation +alexandria +abdul +beats +salary +reunion +ludwig +alright +wagner +intimate +pockets +slovenia +hugged +brighton +merchants +cruel +stole +trek +slopes +repairs +enrollment +politically +underlying +promotional +counting +boeing +##bb +isabella +naming +##и +keen +bacteria +listing +separately +belfast +ussr +450 +lithuanian +anybody +ribs +sphere +martinez +cock +embarrassed +proposals +fragments +nationals +##fs +##wski +premises +fin +1500 +alpine +matched +freely +bounded +jace +sleeve +##af +gaming +pier +populated +evident +##like +frances +flooded +##dle +frightened +pour +trainer +framed +visitor +challenging +pig +wickets +##fold +infected +email +##pes +arose +##aw +reward +ecuador +oblast +vale +ch +shuttle +##usa +bach +rankings +forbidden +cornwall +accordance +salem +consumers +bruno +fantastic +toes +machinery +resolved +julius +remembering +propaganda +iceland +bombardment +tide +contacts +wives +##rah +concerto +macdonald +albania +implement +daisy +tapped +sudan +helmet +angela +mistress +##lic +crop +sunk +finest +##craft +hostile +##ute +##tsu +boxer +fr +paths +adjusted +habit +ballot +supervision +soprano +##zen +bullets +wicked +sunset +regiments +disappear +lamp +performs +app +##gia +##oa +rabbit +digging +incidents +entries +##cion +dishes +##oi +introducing +##ati +##fied +freshman +slot +jill +tackles +baroque +backs +##iest +lone +sponsor +destiny +altogether +convert +##aro +consensus +shapes +demonstration +basically +feminist +auction +artifacts +##bing +strongest +twitter +halifax +2019 +allmusic +mighty +smallest +precise +alexandra +viola +##los +##ille +manuscripts +##illo +dancers +ari +managers +monuments +blades +barracks +springfield +maiden +consolidated +electron +##end +berry +airing +wheat +nobel +inclusion +blair +payments +geography +bee +cc +eleanor +react +##hurst +afc +manitoba +##yu +su +lineup +fitness +recreational +investments +airborne +disappointment +##dis +edmonton +viewing +##row +renovation +##cast +infant +bankruptcy +roses +aftermath +pavilion +##yer +carpenter +withdrawal +ladder +##hy +discussing +popped +reliable +agreements +rochester +##abad +curves 
+bombers +220 +rao +reverend +decreased +choosing +107 +stiff +consulting +naples +crawford +tracy +ka +ribbon +cops +##lee +crushed +deciding +unified +teenager +accepting +flagship +explorer +poles +sanchez +inspection +revived +skilled +induced +exchanged +flee +locals +tragedy +swallow +loading +hanna +demonstrate +##ela +salvador +flown +contestants +civilization +##ines +wanna +rhodes +fletcher +hector +knocking +considers +##ough +nash +mechanisms +sensed +mentally +walt +unclear +##eus +renovated +madame +##cks +crews +governmental +##hin +undertaken +monkey +##ben +##ato +fatal +armored +copa +caves +governance +grasp +perception +certification +froze +damp +tugged +wyoming +##rg +##ero +newman +##lor +nerves +curiosity +graph +115 +##ami +withdraw +tunnels +dull +meredith +moss +exhibits +neighbors +communicate +accuracy +explored +raiders +republicans +secular +kat +superman +penny +criticised +##tch +freed +update +conviction +wade +ham +likewise +delegation +gotta +doll +promises +technological +myth +nationality +resolve +convent +##mark +sharon +dig +sip +coordinator +entrepreneur +fold +##dine +capability +councillor +synonym +blown +swan +cursed +1815 +jonas +haired +sofa +canvas +keeper +rivalry +##hart +rapper +speedway +swords +postal +maxwell +estonia +potter +recurring +##nn +##ave +errors +##oni +cognitive +1834 +##² +claws +nadu +roberto +bce +wrestler +ellie +##ations +infinite +ink +##tia +presumably +finite +staircase +108 +noel +patricia +nacional +##cation +chill +eternal +tu +preventing +prussia +fossil +limbs +##logist +ernst +frog +perez +rene +##ace +pizza +prussian +##ios +##vy +molecules +regulatory +answering +opinions +sworn +lengths +supposedly +hypothesis +upward +habitats +seating +ancestors +drank +yield +hd +synthesis +researcher +modest +##var +mothers +peered +voluntary +homeland +##the +acclaim +##igan +static +valve +luxembourg +alto +carroll +fe +receptor +norton +ambulance +##tian +johnston +catholics +depicting +jointly +elephant +gloria +mentor +badge +ahmad +distinguish +remarked +councils +precisely +allison +advancing +detection +crowded +##10 +cooperative +ankle +mercedes +dagger +surrendered +pollution +commit +subway +jeffrey +lesson +sculptures +provider +##fication +membrane +timothy +rectangular +fiscal +heating +teammate +basket +particle +anonymous +deployment +##ple +missiles +courthouse +proportion +shoe +sec +##ller +complaints +forbes +blacks +abandon +remind +sizes +overwhelming +autobiography +natalie +##awa +risks +contestant +countryside +babies +scorer +invaded +enclosed +proceed +hurling +disorders +##cu +reflecting +continuously +cruiser +graduates +freeway +investigated +ore +deserved +maid +blocking +phillip +jorge +shakes +dove +mann +variables +lacked +burden +accompanying +que +consistently +organizing +provisional +complained +endless +##rm +tubes +juice +georges +krishna +mick +labels +thriller +##uch +laps +arcade +sage +snail +##table +shannon +fi +laurence +seoul +vacation +presenting +hire +churchill +surprisingly +prohibited +savannah +technically +##oli +170 +##lessly +testimony +suited +speeds +toys +romans +mlb +flowering +measurement +talented +kay +settings +charleston +expectations +shattered +achieving +triumph +ceremonies +portsmouth +lanes +mandatory +loser +stretching +cologne +realizes +seventy +cornell +careers +webb +##ulating +americas +budapest +ava +suspicion +##ison +yo +conrad +##hai +sterling +jessie +rector +##az +1831 +transform +organize +loans +christine +volcanic +warrant +slender 
+summers +subfamily +newer +danced +dynamics +rhine +proceeds +heinrich +gastropod +commands +sings +facilitate +easter +ra +positioned +responses +expense +fruits +yanked +imported +25th +velvet +vic +primitive +tribune +baldwin +neighbourhood +donna +rip +hay +pr +##uro +1814 +espn +welcomed +##aria +qualifier +glare +highland +timing +##cted +shells +eased +geometry +louder +exciting +slovakia +##sion +##iz +##lot +savings +prairie +##ques +marching +rafael +tonnes +##lled +curtain +preceding +shy +heal +greene +worthy +##pot +detachment +bury +sherman +##eck +reinforced +seeks +bottles +contracted +duchess +outfit +walsh +##sc +mickey +##ase +geoffrey +archer +squeeze +dawson +eliminate +invention +##enberg +neal +##eth +stance +dealer +coral +maple +retire +polo +simplified +##ht +1833 +hid +watts +backwards +jules +##oke +genesis +mt +frames +rebounds +burma +woodland +moist +santos +whispers +drained +subspecies +##aa +streaming +ulster +burnt +correspondence +maternal +gerard +denis +stealing +##load +genius +duchy +##oria +inaugurated +momentum +suits +placement +sovereign +clause +thames +##hara +confederation +reservation +sketch +yankees +lets +rotten +charm +hal +verses +ultra +commercially +dot +salon +citation +adopt +winnipeg +mist +allocated +cairo +##boy +jenkins +interference +objectives +##wind +1820 +portfolio +armoured +sectors +##eh +initiatives +##world +integrity +exercises +robe +tap +ab +gazed +##tones +distracted +rulers +111 +favorable +jerome +tended +cart +factories +##eri +diplomat +valued +gravel +charitable +##try +calvin +exploring +chang +shepherd +terrace +pdf +pupil +##ural +reflects +ups +##rch +governors +shelf +depths +##nberg +trailed +crest +tackle +##nian +##ats +hatred +##kai +clare +makers +ethiopia +longtime +detected +embedded +lacking +slapped +rely +thomson +anticipation +iso +morton +successive +agnes +screenwriter +straightened +philippe +playwright +haunted +licence +iris +intentions +sutton +112 +logical +correctly +##weight +branded +licked +tipped +silva +ricky +narrator +requests +##ents +greeted +supernatural +cow +##wald +lung +refusing +employer +strait +gaelic +liner +##piece +zoe +sabha +##mba +driveway +harvest +prints +bates +reluctantly +threshold +algebra +ira +wherever +coupled +240 +assumption +picks +##air +designers +raids +gentlemen +##ean +roller +blowing +leipzig +locks +screw +dressing +strand +##lings +scar +dwarf +depicts +##nu +nods +##mine +differ +boris +##eur +yuan +flip +##gie +mob +invested +questioning +applying +##ture +shout +##sel +gameplay +blamed +illustrations +bothered +weakness +rehabilitation +##of +##zes +envelope +rumors +miners +leicester +subtle +kerry +##ico +ferguson +##fu +premiership +ne +##cat +bengali +prof +catches +remnants +dana +##rily +shouting +presidents +baltic +ought +ghosts +dances +sailors +shirley +fancy +dominic +##bie +madonna +##rick +bark +buttons +gymnasium +ashes +liver +toby +oath +providence +doyle +evangelical +nixon +cement +carnegie +embarked +hatch +surroundings +guarantee +needing +pirate +essence +##bee +filter +crane +hammond +projected +immune +percy +twelfth +##ult +regent +doctoral +damon +mikhail +##ichi +lu +critically +elect +realised +abortion +acute +screening +mythology +steadily +##fc +frown +nottingham +kirk +wa +minneapolis +##rra +module +algeria +mc +nautical +encounters +surprising +statues +availability +shirts +pie +alma +brows +munster +mack +soup +crater +tornado +sanskrit +cedar +explosive +bordered +dixon +planets +stamp +exam +happily 
+##bble +carriers +kidnapped +##vis +accommodation +emigrated +##met +knockout +correspondent +violation +profits +peaks +lang +specimen +agenda +ancestry +pottery +spelling +equations +obtaining +ki +linking +1825 +debris +asylum +##20 +buddhism +teddy +##ants +gazette +##nger +##sse +dental +eligibility +utc +fathers +averaged +zimbabwe +francesco +coloured +hissed +translator +lynch +mandate +humanities +mackenzie +uniforms +lin +##iana +##gio +asset +mhz +fitting +samantha +genera +wei +rim +beloved +shark +riot +entities +expressions +indo +carmen +slipping +owing +abbot +neighbor +sidney +##av +rats +recommendations +encouraging +squadrons +anticipated +commanders +conquered +##oto +donations +diagnosed +##mond +divide +##iva +guessed +decoration +vernon +auditorium +revelation +conversations +##kers +##power +herzegovina +dash +alike +protested +lateral +herman +accredited +mg +##gent +freeman +mel +fiji +crow +crimson +##rine +livestock +##pped +humanitarian +bored +oz +whip +##lene +##ali +legitimate +alter +grinning +spelled +anxious +oriental +wesley +##nin +##hole +carnival +controller +detect +##ssa +bowed +educator +kosovo +macedonia +##sin +occupy +mastering +stephanie +janeiro +para +unaware +nurses +noon +135 +cam +hopefully +ranger +combine +sociology +polar +rica +##eer +neill +##sman +holocaust +##ip +doubled +lust +1828 +109 +decent +cooling +unveiled +##card +1829 +nsw +homer +chapman +meyer +##gin +dive +mae +reagan +expertise +##gled +darwin +brooke +sided +prosecution +investigating +comprised +petroleum +genres +reluctant +differently +trilogy +johns +vegetables +corpse +highlighted +lounge +pension +unsuccessfully +elegant +aided +ivory +beatles +amelia +cain +dubai +sunny +immigrant +babe +click +##nder +underwater +pepper +combining +mumbled +atlas +horns +accessed +ballad +physicians +homeless +gestured +rpm +freak +louisville +corporations +patriots +prizes +rational +warn +modes +decorative +overnight +din +troubled +phantom +##ort +monarch +sheer +##dorf +generals +guidelines +organs +addresses +##zon +enhance +curling +parishes +cord +##kie +linux +caesar +deutsche +bavaria +##bia +coleman +cyclone +##eria +bacon +petty +##yama +##old +hampton +diagnosis +1824 +throws +complexity +rita +disputed +##₃ +pablo +##sch +marketed +trafficking +##ulus +examine +plague +formats +##oh +vault +faithful +##bourne +webster +##ox +highlights +##ient +##ann +phones +vacuum +sandwich +modeling +##gated +bolivia +clergy +qualities +isabel +##nas +##ars +wears +screams +reunited +annoyed +bra +##ancy +##rate +differential +transmitter +tattoo +container +poker +##och +excessive +resides +cowboys +##tum +augustus +trash +providers +statute +retreated +balcony +reversed +void +storey +preceded +masses +leap +laughs +neighborhoods +wards +schemes +falcon +santo +battlefield +pad +ronnie +thread +lesbian +venus +##dian +beg +sandstone +daylight +punched +gwen +analog +stroked +wwe +acceptable +measurements +dec +toxic +##kel +adequate +surgical +economist +parameters +varsity +##sberg +quantity +ella +##chy +##rton +countess +generating +precision +diamonds +expressway +ga +##ı +1821 +uruguay +talents +galleries +expenses +scanned +colleague +outlets +ryder +lucien +##ila +paramount +##bon +syracuse +dim +fangs +gown +sweep +##sie +toyota +missionaries +websites +##nsis +sentences +adviser +val +trademark +spells +##plane +patience +starter +slim +##borg +toe +incredibly +shoots +elliot +nobility +##wyn +cowboy +endorsed +gardner +tendency +persuaded +organisms +emissions 
+kazakhstan +amused +boring +chips +themed +##hand +llc +constantinople +chasing +systematic +guatemala +borrowed +erin +carey +##hard +highlands +struggles +1810 +##ifying +##ced +wong +exceptions +develops +enlarged +kindergarten +castro +##ern +##rina +leigh +zombie +juvenile +##most +consul +##nar +sailor +hyde +clarence +intensive +pinned +nasty +useless +jung +clayton +stuffed +exceptional +ix +apostolic +230 +transactions +##dge +exempt +swinging +cove +religions +##ash +shields +dairy +bypass +190 +pursuing +bug +joyce +bombay +chassis +southampton +chat +interact +redesignated +##pen +nascar +pray +salmon +rigid +regained +malaysian +grim +publicity +constituted +capturing +toilet +delegate +purely +tray +drift +loosely +striker +weakened +trinidad +mitch +itv +defines +transmitted +ming +scarlet +nodding +fitzgerald +fu +narrowly +sp +tooth +standings +virtue +##₁ +##wara +##cting +chateau +gloves +lid +##nel +hurting +conservatory +##pel +sinclair +reopened +sympathy +nigerian +strode +advocated +optional +chronic +discharge +##rc +suck +compatible +laurel +stella +shi +fails +wage +dodge +128 +informal +sorts +levi +buddha +villagers +##aka +chronicles +heavier +summoned +gateway +3000 +eleventh +jewelry +translations +accordingly +seas +##ency +fiber +pyramid +cubic +dragging +##ista +caring +##ops +android +contacted +lunar +##dt +kai +lisbon +patted +1826 +sacramento +theft +madagascar +subtropical +disputes +ta +holidays +piper +willow +mare +cane +itunes +newfoundland +benny +companions +dong +raj +observe +roar +charming +plaque +tibetan +fossils +enacted +manning +bubble +tina +tanzania +##eda +##hir +funk +swamp +deputies +cloak +ufc +scenario +par +scratch +metals +anthem +guru +engaging +specially +##boat +dialects +nineteen +cecil +duet +disability +messenger +unofficial +##lies +defunct +eds +moonlight +drainage +surname +puzzle +honda +switching +conservatives +mammals +knox +broadcaster +sidewalk +cope +##ried +benson +princes +peterson +##sal +bedford +sharks +eli +wreck +alberto +gasp +archaeology +lgbt +teaches +securities +madness +compromise +waving +coordination +davidson +visions +leased +possibilities +eighty +jun +fernandez +enthusiasm +assassin +sponsorship +reviewer +kingdoms +estonian +laboratories +##fy +##nal +applies +verb +celebrations +##zzo +rowing +lightweight +sadness +submit +mvp +balanced +dude +##vas +explicitly +metric +magnificent +mound +brett +mohammad +mistakes +irregular +##hing +##ass +sanders +betrayed +shipped +surge +##enburg +reporters +termed +georg +pity +verbal +bulls +abbreviated +enabling +appealed +##are +##atic +sicily +sting +heel +sweetheart +bart +spacecraft +brutal +monarchy +##tter +aberdeen +cameo +diane +##ub +survivor +clyde +##aries +complaint +##makers +clarinet +delicious +chilean +karnataka +coordinates +1818 +panties +##rst +pretending +ar +dramatically +kiev +bella +tends +distances +113 +catalog +launching +instances +telecommunications +portable +lindsay +vatican +##eim +angles +aliens +marker +stint +screens +bolton +##rne +judy +wool +benedict +plasma +europa +spark +imaging +filmmaker +swiftly +##een +contributor +##nor +opted +stamps +apologize +financing +butter +gideon +sophisticated +alignment +avery +chemicals +yearly +speculation +prominence +professionally +##ils +immortal +institutional +inception +wrists +identifying +tribunal +derives +gains +##wo +papal +preference +linguistic +vince +operative +brewery +##ont +unemployment +boyd +##ured +##outs +albeit +prophet +1813 +bi +##rr +##face +##rad 
+quarterly +asteroid +cleaned +radius +temper +##llen +telugu +jerk +viscount +menu +##ote +glimpse +##aya +yacht +hawaiian +baden +##rl +laptop +readily +##gu +monetary +offshore +scots +watches +##yang +##arian +upgrade +needle +xbox +lea +encyclopedia +flank +fingertips +##pus +delight +teachings +confirm +roth +beaches +midway +winters +##iah +teasing +daytime +beverly +gambling +bonnie +##backs +regulated +clement +hermann +tricks +knot +##shing +##uring +##vre +detached +ecological +owed +specialty +byron +inventor +bats +stays +screened +unesco +midland +trim +affection +##ander +##rry +jess +thoroughly +feedback +##uma +chennai +strained +heartbeat +wrapping +overtime +pleaded +##sworth +mon +leisure +oclc +##tate +##ele +feathers +angelo +thirds +nuts +surveys +clever +gill +commentator +##dos +darren +rides +gibraltar +##nc +##mu +dissolution +dedication +shin +meals +saddle +elvis +reds +chaired +taller +appreciation +functioning +niece +favored +advocacy +robbie +criminals +suffolk +yugoslav +passport +constable +congressman +hastings +vera +##rov +consecrated +sparks +ecclesiastical +confined +##ovich +muller +floyd +nora +1822 +paved +1827 +cumberland +ned +saga +spiral +##flow +appreciated +yi +collaborative +treating +similarities +feminine +finishes +##ib +jade +import +##nse +##hot +champagne +mice +securing +celebrities +helsinki +attributes +##gos +cousins +phases +ache +lucia +gandhi +submission +vicar +spear +shine +tasmania +biting +detention +constitute +tighter +seasonal +##gus +terrestrial +matthews +##oka +effectiveness +parody +philharmonic +##onic +1816 +strangers +encoded +consortium +guaranteed +regards +shifts +tortured +collision +supervisor +inform +broader +insight +theaters +armour +emeritus +blink +incorporates +mapping +##50 +##ein +handball +flexible +##nta +substantially +generous +thief +##own +carr +loses +1793 +prose +ucla +romeo +generic +metallic +realization +damages +mk +commissioners +zach +default +##ther +helicopters +lengthy +stems +spa +partnered +spectators +rogue +indication +penalties +teresa +1801 +sen +##tric +dalton +##wich +irving +photographic +##vey +dell +deaf +peters +excluded +unsure +##vable +patterson +crawled +##zio +resided +whipped +latvia +slower +ecole +pipes +employers +maharashtra +comparable +va +textile +pageant +##gel +alphabet +binary +irrigation +chartered +choked +antoine +offs +waking +supplement +##wen +quantities +demolition +regain +locate +urdu +folks +alt +114 +##mc +scary +andreas +whites +##ava +classrooms +mw +aesthetic +publishes +valleys +guides +cubs +johannes +bryant +conventions +affecting +##itt +drain +awesome +isolation +prosecutor +ambitious +apology +captive +downs +atmospheric +lorenzo +aisle +beef +foul +##onia +kidding +composite +disturbed +illusion +natives +##ffer +emi +rockets +riverside +wartime +painters +adolf +melted +##ail +uncertainty +simulation +hawks +progressed +meantime +builder +spray +breach +unhappy +regina +russians +##urg +determining +##tation +tram +1806 +##quin +aging +##12 +1823 +garion +rented +mister +diaz +terminated +clip +1817 +depend +nervously +disco +owe +defenders +shiva +notorious +disbelief +shiny +worcester +##gation +##yr +trailing +undertook +islander +belarus +limitations +watershed +fuller +overlooking +utilized +raphael +1819 +synthetic +breakdown +klein +##nate +moaned +memoir +lamb +practicing +##erly +cellular +arrows +exotic +##graphy +witches +117 +charted +rey +hut +hierarchy +subdivision +freshwater +giuseppe +aloud +reyes +qatar +marty 
+sideways +utterly +sexually +jude +prayers +mccarthy +softball +blend +damien +##gging +##metric +wholly +erupted +lebanese +negro +revenues +tasted +comparative +teamed +transaction +labeled +maori +sovereignty +parkway +trauma +gran +malay +121 +advancement +descendant +2020 +buzz +salvation +inventory +symbolic +##making +antarctica +mps +##gas +##bro +mohammed +myanmar +holt +submarines +tones +##lman +locker +patriarch +bangkok +emerson +remarks +predators +kin +afghan +confession +norwich +rental +emerge +advantages +##zel +rca +##hold +shortened +storms +aidan +##matic +autonomy +compliance +##quet +dudley +atp +##osis +1803 +motto +documentation +summary +professors +spectacular +christina +archdiocese +flashing +innocence +remake +##dell +psychic +reef +scare +employ +rs +sticks +meg +gus +leans +##ude +accompany +bergen +tomas +##iko +doom +wages +pools +##nch +##bes +breasts +scholarly +alison +outline +brittany +breakthrough +willis +realistic +##cut +##boro +competitor +##stan +pike +picnic +icon +designing +commercials +washing +villain +skiing +micro +costumes +auburn +halted +executives +##hat +logistics +cycles +vowel +applicable +barrett +exclaimed +eurovision +eternity +ramon +##umi +##lls +modifications +sweeping +disgust +##uck +torch +aviv +ensuring +rude +dusty +sonic +donovan +outskirts +cu +pathway +##band +##gun +##lines +disciplines +acids +cadet +paired +##40 +sketches +##sive +marriages +##⁺ +folding +peers +slovak +implies +admired +##beck +1880s +leopold +instinct +attained +weston +megan +horace +##ination +dorsal +ingredients +evolutionary +##its +complications +deity +lethal +brushing +levy +deserted +institutes +posthumously +delivering +telescope +coronation +motivated +rapids +luc +flicked +pays +volcano +tanner +weighed +##nica +crowds +frankie +gifted +addressing +granddaughter +winding +##rna +constantine +gomez +##front +landscapes +rudolf +anthropology +slate +werewolf +##lio +astronomy +circa +rouge +dreaming +sack +knelt +drowned +naomi +prolific +tracked +freezing +herb +##dium +agony +randall +twisting +wendy +deposit +touches +vein +wheeler +##bbled +##bor +batted +retaining +tire +presently +compare +specification +daemon +nigel +##grave +merry +recommendation +czechoslovakia +sandra +ng +roma +##sts +lambert +inheritance +sheikh +winchester +cries +examining +##yle +comeback +cuisine +nave +##iv +ko +retrieve +tomatoes +barker +polished +defining +irene +lantern +personalities +begging +tract +swore +1809 +175 +##gic +omaha +brotherhood +##rley +haiti +##ots +exeter +##ete +##zia +steele +dumb +pearson +210 +surveyed +elisabeth +trends +##ef +fritz +##rf +premium +bugs +fraction +calmly +viking +##birds +tug +inserted +unusually +##ield +confronted +distress +crashing +brent +turks +resign +##olo +cambodia +gabe +sauce +##kal +evelyn +116 +extant +clusters +quarry +teenagers +luna +##lers +##ister +affiliation +drill +##ashi +panthers +scenic +libya +anita +strengthen +inscriptions +##cated +lace +sued +judith +riots +##uted +mint +##eta +preparations +midst +dub +challenger +##vich +mock +cf +displaced +wicket +breaths +enables +schmidt +analyst +##lum +ag +highlight +automotive +axe +josef +newark +sufficiently +resembles +50th +##pal +flushed +mum +traits +##ante +commodore +incomplete +warming +titular +ceremonial +ethical +118 +celebrating +eighteenth +cao +lima +medalist +mobility +strips +snakes +##city +miniature +zagreb +barton +escapes +umbrella +automated +doubted +differs +cooled +georgetown +dresden +cooked +fade +wyatt +rna 
+jacobs +carlton +abundant +stereo +boost +madras +inning +##hia +spur +ip +malayalam +begged +osaka +groan +escaping +charging +dose +vista +##aj +bud +papa +communists +advocates +edged +tri +##cent +resemble +peaking +necklace +fried +montenegro +saxony +goose +glances +stuttgart +curator +recruit +grocery +sympathetic +##tting +##fort +127 +lotus +randolph +ancestor +##rand +succeeding +jupiter +1798 +macedonian +##heads +hiking +1808 +handing +fischer +##itive +garbage +node +##pies +prone +singular +papua +inclined +attractions +italia +pouring +motioned +grandma +garnered +jacksonville +corp +ego +ringing +aluminum +##hausen +ordering +##foot +drawer +traders +synagogue +##play +##kawa +resistant +wandering +fragile +fiona +teased +var +hardcore +soaked +jubilee +decisive +exposition +mercer +poster +valencia +hale +kuwait +1811 +##ises +##wr +##eed +tavern +gamma +122 +johan +##uer +airways +amino +gil +##ury +vocational +domains +torres +##sp +generator +folklore +outcomes +##keeper +canberra +shooter +fl +beams +confrontation +##lling +##gram +feb +aligned +forestry +pipeline +jax +motorway +conception +decay +##tos +coffin +##cott +stalin +1805 +escorted +minded +##nam +sitcom +purchasing +twilight +veronica +additions +passive +tensions +straw +123 +frequencies +1804 +refugee +cultivation +##iate +christie +clary +bulletin +crept +disposal +##rich +##zong +processor +crescent +##rol +bmw +emphasized +whale +nazis +aurora +##eng +dwelling +hauled +sponsors +toledo +mega +ideology +theatres +tessa +cerambycidae +saves +turtle +cone +suspects +kara +rusty +yelling +greeks +mozart +shades +cocked +participant +##tro +shire +spit +freeze +necessity +##cos +inmates +nielsen +councillors +loaned +uncommon +omar +peasants +botanical +offspring +daniels +formations +jokes +1794 +pioneers +sigma +licensing +##sus +wheelchair +polite +1807 +liquor +pratt +trustee +##uta +forewings +balloon +##zz +kilometre +camping +explicit +casually +shawn +foolish +teammates +nm +hassan +carrie +judged +satisfy +vanessa +knives +selective +cnn +flowed +##lice +eclipse +stressed +eliza +mathematician +cease +cultivated +##roy +commissions +browns +##ania +destroyers +sheridan +meadow +##rius +minerals +##cial +downstream +clash +gram +memoirs +ventures +baha +seymour +archie +midlands +edith +fare +flynn +invite +canceled +tiles +stabbed +boulder +incorporate +amended +camden +facial +mollusk +unreleased +descriptions +yoga +grabs +550 +raises +ramp +shiver +##rose +coined +pioneering +tunes +qing +warwick +tops +119 +melanie +giles +##rous +wandered +##inal +annexed +nov +30th +unnamed +##ished +organizational +airplane +normandy +stoke +whistle +blessing +violations +chased +holders +shotgun +##ctic +outlet +reactor +##vik +tires +tearing +shores +fortified +mascot +constituencies +nc +columnist +productive +tibet +##rta +lineage +hooked +oct +tapes +judging +cody +##gger +hansen +kashmir +triggered +##eva +solved +cliffs +##tree +resisted +anatomy +protesters +transparent +implied +##iga +injection +mattress +excluding +##mbo +defenses +helpless +devotion +##elli +growl +liberals +weber +phenomena +atoms +plug +##iff +mortality +apprentice +howe +convincing +aaa +swimmer +barber +leone +promptly +sodium +def +nowadays +arise +##oning +gloucester +corrected +dignity +norm +erie +##ders +elders +evacuated +sylvia +compression +##yar +hartford +pose +backpack +reasoning +accepts +24th +wipe +millimetres +marcel +##oda +dodgers +albion +1790 +overwhelmed +aerospace +oaks +1795 +showcase +acknowledge 
+recovering +nolan +ashe +hurts +geology +fashioned +disappearance +farewell +swollen +shrug +marquis +wimbledon +124 +rue +1792 +commemorate +reduces +experiencing +inevitable +calcutta +intel +##court +murderer +sticking +fisheries +imagery +bloom +280 +brake +##inus +gustav +hesitation +memorable +po +viral +beans +accidents +tunisia +antenna +spilled +consort +treatments +aye +perimeter +##gard +donation +hostage +migrated +banker +addiction +apex +lil +trout +##ously +conscience +##nova +rams +sands +genome +passionate +troubles +##lets +##set +amid +##ibility +##ret +higgins +exceed +vikings +##vie +payne +##zan +muscular +##ste +defendant +sucking +##wal +ibrahim +fuselage +claudia +vfl +europeans +snails +interval +##garh +preparatory +statewide +tasked +lacrosse +viktor +##lation +angola +##hra +flint +implications +employs +teens +patrons +stall +weekends +barriers +scrambled +nucleus +tehran +jenna +parsons +lifelong +robots +displacement +5000 +##bles +precipitation +##gt +knuckles +clutched +1802 +marrying +ecology +marx +accusations +declare +scars +kolkata +mat +meadows +bermuda +skeleton +finalists +vintage +crawl +coordinate +affects +subjected +orchestral +mistaken +##tc +mirrors +dipped +relied +260 +arches +candle +##nick +incorporating +wildly +fond +basilica +owl +fringe +rituals +whispering +stirred +feud +tertiary +slick +goat +honorable +whereby +skip +ricardo +stripes +parachute +adjoining +submerged +synthesizer +##gren +intend +positively +ninety +phi +beaver +partition +fellows +alexis +prohibition +carlisle +bizarre +fraternity +##bre +doubts +icy +cbc +aquatic +sneak +sonny +combines +airports +crude +supervised +spatial +merge +alfonso +##bic +corrupt +scan +undergo +##ams +disabilities +colombian +comparing +dolphins +perkins +##lish +reprinted +unanimous +bounced +hairs +underworld +midwest +semester +bucket +paperback +miniseries +coventry +demise +##leigh +demonstrations +sensor +rotating +yan +##hler +arrange +soils +##idge +hyderabad +labs +##dr +brakes +grandchildren +##nde +negotiated +rover +ferrari +continuation +directorate +augusta +stevenson +counterpart +gore +##rda +nursery +rican +ave +collectively +broadly +pastoral +repertoire +asserted +discovering +nordic +styled +fiba +cunningham +harley +middlesex +survives +tumor +tempo +zack +aiming +lok +urgent +##rade +##nto +devils +##ement +contractor +turin +##wl +##ool +bliss +repaired +simmons +moan +astronomical +cr +negotiate +lyric +1890s +lara +bred +clad +angus +pbs +##ience +engineered +posed +##lk +hernandez +possessions +elbows +psychiatric +strokes +confluence +electorate +lifts +campuses +lava +alps +##ep +##ution +##date +physicist +woody +##page +##ographic +##itis +juliet +reformation +sparhawk +320 +complement +suppressed +jewel +##½ +floated +##kas +continuity +sadly +##ische +inability +melting +scanning +paula +flour +judaism +safer +vague +##lm +solving +curb +##stown +financially +gable +bees +expired +miserable +cassidy +dominion +1789 +cupped +145 +robbery +facto +amos +warden +resume +tallest +marvin +ing +pounded +usd +declaring +gasoline +##aux +darkened +270 +650 +sophomore +##mere +erection +gossip +televised +risen +dial +##eu +pillars +##link +passages +profound +##tina +arabian +ashton +silicon +nail +##ead +##lated +##wer +##hardt +fleming +firearms +ducked +circuits +blows +waterloo +titans +##lina +atom +fireplace +cheshire +financed +activation +algorithms +##zzi +constituent +catcher +cherokee +partnerships +sexuality +platoon +tragic +vivian +guarded +whiskey 
+[... WordPiece vocabulary entries, one token per line in the source diff (subword pieces prefixed with "##"); this segment runs roughly from "meditation" through "johanna" and the individual tokens are elided here ...]
+##imate +gel +suspiciously +1767 +sobs +##dington +backbone +hayward +garry +##folding +##nesia +maxi +##oof +##ppe +ellison +galileo +##stand +crimea +frenzy +amour +bumper +matrices +natalia +baking +garth +palestinians +##grove +smack +conveyed +ensembles +gardening +##manship +##rup +##stituting +1640 +harvesting +topography +jing +shifters +dormitory +##carriage +##lston +ist +skulls +##stadt +dolores +jewellery +sarawak +##wai +##zier +fences +christy +confinement +tumbling +credibility +fir +stench +##bria +##plication +##nged +##sam +virtues +##belt +marjorie +pba +##eem +##made +celebrates +schooner +agitated +barley +fulfilling +anthropologist +##pro +restrict +novi +regulating +##nent +padres +##rani +##hesive +loyola +tabitha +milky +olson +proprietor +crambidae +guarantees +intercollegiate +ljubljana +hilda +##sko +ignorant +hooded +##lts +sardinia +##lidae +##vation +frontman +privileged +witchcraft +##gp +jammed +laude +poking +##than +bracket +amazement +yunnan +##erus +maharaja +linnaeus +264 +commissioning +milano +peacefully +##logies +akira +rani +regulator +##36 +grasses +##rance +luzon +crows +compiler +gretchen +seaman +edouard +tab +buccaneers +ellington +hamlets +whig +socialists +##anto +directorial +easton +mythological +##kr +##vary +rhineland +semantic +taut +dune +inventions +succeeds +##iter +replication +branched +##pired +jul +prosecuted +kangaroo +penetrated +##avian +middlesbrough +doses +bleak +madam +predatory +relentless +##vili +reluctance +##vir +hailey +crore +silvery +1759 +monstrous +swimmers +transmissions +hawthorn +informing +##eral +toilets +caracas +crouch +kb +##sett +295 +cartel +hadley +##aling +alexia +yvonne +##biology +cinderella +eton +superb +blizzard +stabbing +industrialist +maximus +##gm +##orus +groves +maud +clade +oversized +comedic +##bella +rosen +nomadic +fulham +montane +beverages +galaxies +redundant +swarm +##rot +##folia +##llis +buckinghamshire +fen +bearings +bahadur +##rom +gilles +phased +dynamite +faber +benoit +vip +##ount +##wd +booking +fractured +tailored +anya +spices +westwood +cairns +auditions +inflammation +steamed +##rocity +##acion +##urne +skyla +thereof +watford +torment +archdeacon +transforms +lulu +demeanor +fucked +serge +##sor +mckenna +minas +entertainer +##icide +caress +originate +residue +##sty +1740 +##ilised +##org +beech +##wana +subsidies +##ghton +emptied +gladstone +ru +firefighters +voodoo +##rcle +het +nightingale +tamara +edmond +ingredient +weaknesses +silhouette +285 +compatibility +withdrawing +hampson +##mona +anguish +giggling +##mber +bookstore +##jiang +southernmost +tilting +##vance +bai +economical +rf +briefcase +dreadful +hinted +projections +shattering +totaling +##rogate +analogue +indicted +periodical +fullback +##dman +haynes +##tenberg +##ffs +##ishment +1745 +thirst +stumble +penang +vigorous +##ddling +##kor +##lium +octave +##ove +##enstein +##inen +##ones +siberian +##uti +cbn +repeal +swaying +##vington +khalid +tanaka +unicorn +otago +plastered +lobe +riddle +##rella +perch +##ishing +croydon +filtered +graeme +tripoli +##ossa +crocodile +##chers +sufi +mined +##tung +inferno +lsu +##phi +swelled +utilizes +£2 +cale +periodicals +styx +hike +informally +coop +lund +##tidae +ala +hen +qui +transformations +disposed +sheath +chickens +##cade +fitzroy +sas +silesia +unacceptable +odisha +1650 +sabrina +pe +spokane +ratios +athena +massage +shen +dilemma +##drum +##riz +##hul +corona +doubtful +niall +##pha +##bino +fines +cite +acknowledging +bangor +ballard 
+bathurst +##resh +huron +mustered +alzheimer +garments +kinase +tyre +warship +##cp +flashback +pulmonary +braun +cheat +kamal +cyclists +constructions +grenades +ndp +traveller +excuses +stomped +signalling +trimmed +futsal +mosques +relevance +##wine +wta +##23 +##vah +##lter +hoc +##riding +optimistic +##´s +deco +sim +interacting +rejecting +moniker +waterways +##ieri +##oku +mayors +gdansk +outnumbered +pearls +##ended +##hampton +fairs +totals +dominating +262 +notions +stairway +compiling +pursed +commodities +grease +yeast +##jong +carthage +griffiths +residual +amc +contraction +laird +sapphire +##marine +##ivated +amalgamation +dissolve +inclination +lyle +packaged +altitudes +suez +canons +graded +lurched +narrowing +boasts +guise +wed +enrico +##ovsky +rower +scarred +bree +cub +iberian +protagonists +bargaining +proposing +trainers +voyages +vans +fishes +##aea +##ivist +##verance +encryption +artworks +kazan +sabre +cleopatra +hepburn +rotting +supremacy +mecklenburg +##brate +burrows +hazards +outgoing +flair +organizes +##ctions +scorpion +##usions +boo +234 +chevalier +dunedin +slapping +##34 +ineligible +pensions +##38 +##omic +manufactures +emails +bismarck +238 +weakening +blackish +ding +mcgee +quo +##rling +northernmost +xx +manpower +greed +sampson +clicking +##ange +##horpe +##inations +##roving +torre +##eptive +##moral +symbolism +38th +asshole +meritorious +outfits +splashed +biographies +sprung +astros +##tale +302 +737 +filly +raoul +nw +tokugawa +linden +clubhouse +##apa +tracts +romano +##pio +putin +tags +##note +chained +dickson +gunshot +moe +gunn +rashid +##tails +zipper +##bas +##nea +contrasted +##ply +##udes +plum +pharaoh +##pile +aw +comedies +ingrid +sandwiches +subdivisions +1100 +mariana +nokia +kamen +hz +delaney +veto +herring +##words +possessive +outlines +##roup +siemens +stairwell +rc +gallantry +messiah +palais +yells +233 +zeppelin +##dm +bolivar +##cede +smackdown +mckinley +##mora +##yt +muted +geologic +finely +unitary +avatar +hamas +maynard +rees +bog +contrasting +##rut +liv +chico +disposition +pixel +##erate +becca +dmitry +yeshiva +narratives +##lva +##ulton +mercenary +sharpe +tempered +navigate +stealth +amassed +keynes +##lini +untouched +##rrie +havoc +lithium +##fighting +abyss +graf +southward +wolverine +balloons +implements +ngos +transitions +##icum +ambushed +concacaf +dormant +economists +##dim +costing +csi +rana +universite +boulders +verity +##llon +collin +mellon +misses +cypress +fluorescent +lifeless +spence +##ulla +crewe +shepard +pak +revelations +##م +jolly +gibbons +paw +##dro +##quel +freeing +##test +shack +fries +palatine +##51 +##hiko +accompaniment +cruising +recycled +##aver +erwin +sorting +synthesizers +dyke +realities +sg +strides +enslaved +wetland +##ghan +competence +gunpowder +grassy +maroon +reactors +objection +##oms +carlson +gearbox +macintosh +radios +shelton +##sho +clergyman +prakash +254 +mongols +trophies +oricon +228 +stimuli +twenty20 +cantonese +cortes +mirrored +##saurus +bhp +cristina +melancholy +##lating +enjoyable +nuevo +##wny +downfall +schumacher +##ind +banging +lausanne +rumbled +paramilitary +reflex +ax +amplitude +migratory +##gall +##ups +midi +barnard +lastly +sherry +##hp +##nall +keystone +##kra +carleton +slippery +##53 +coloring +foe +socket +otter +##rgos +mats +##tose +consultants +bafta +bison +topping +##km +490 +primal +abandonment +transplant +atoll +hideous +mort +pained +reproduced +tae +howling +##turn +unlawful +billionaire +hotter +poised +lansing 
+##chang +dinamo +retro +messing +nfc +domesday +##mina +blitz +timed +##athing +##kley +ascending +gesturing +##izations +signaled +tis +chinatown +mermaid +savanna +jameson +##aint +catalina +##pet +##hers +cochrane +cy +chatting +##kus +alerted +computation +mused +noelle +majestic +mohawk +campo +octagonal +##sant +##hend +241 +aspiring +##mart +comprehend +iona +paralyzed +shimmering +swindon +rhone +##eley +reputed +configurations +pitchfork +agitation +francais +gillian +lipstick +##ilo +outsiders +pontifical +resisting +bitterness +sewer +rockies +##edd +##ucher +misleading +1756 +exiting +galloway +##nging +risked +##heart +246 +commemoration +schultz +##rka +integrating +##rsa +poses +shrieked +##weiler +guineas +gladys +jerking +owls +goldsmith +nightly +penetrating +##unced +lia +##33 +ignited +betsy +##aring +##thorpe +follower +vigorously +##rave +coded +kiran +knit +zoology +tbilisi +##28 +##bered +repository +govt +deciduous +dino +growling +##bba +enhancement +unleashed +chanting +pussy +biochemistry +##eric +kettle +repression +toxicity +nrhp +##arth +##kko +##bush +ernesto +commended +outspoken +242 +mca +parchment +sms +kristen +##aton +bisexual +raked +glamour +navajo +a2 +conditioned +showcased +##hma +spacious +youthful +##esa +usl +appliances +junta +brest +layne +conglomerate +enchanted +chao +loosened +picasso +circulating +inspect +montevideo +##centric +##kti +piazza +spurred +##aith +bari +freedoms +poultry +stamford +lieu +##ect +indigo +sarcastic +bahia +stump +attach +dvds +frankenstein +lille +approx +scriptures +pollen +##script +nmi +overseen +##ivism +tides +proponent +newmarket +inherit +milling +##erland +centralized +##rou +distributors +credentials +drawers +abbreviation +##lco +##xon +downing +uncomfortably +ripe +##oes +erase +franchises +##ever +populace +##bery +##khar +decomposition +pleas +##tet +daryl +sabah +##stle +##wide +fearless +genie +lesions +annette +##ogist +oboe +appendix +nair +dripped +petitioned +maclean +mosquito +parrot +rpg +hampered +1648 +operatic +reservoirs +##tham +irrelevant +jolt +summarized +##fp +medallion +##taff +##− +clawed +harlow +narrower +goddard +marcia +bodied +fremont +suarez +altering +tempest +mussolini +porn +##isms +sweetly +oversees +walkers +solitude +grimly +shrines +hk +ich +supervisors +hostess +dietrich +legitimacy +brushes +expressive +##yp +dissipated +##rse +localized +systemic +##nikov +gettysburg +##js +##uaries +dialogues +muttering +251 +housekeeper +sicilian +discouraged +##frey +beamed +kaladin +halftime +kidnap +##amo +##llet +1754 +synonymous +depleted +instituto +insulin +reprised +##opsis +clashed +##ctric +interrupting +radcliffe +insisting +medici +1715 +ejected +playfully +turbulent +##47 +starvation +##rini +shipment +rebellious +petersen +verification +merits +##rified +cakes +##charged +1757 +milford +shortages +spying +fidelity +##aker +emitted +storylines +harvested +seismic +##iform +cheung +kilda +theoretically +barbie +lynx +##rgy +##tius +goblin +mata +poisonous +##nburg +reactive +residues +obedience +##евич +conjecture +##rac +401 +hating +sixties +kicker +moaning +motown +##bha +emancipation +neoclassical +##hering +consoles +ebert +professorship +##tures +sustaining +assaults +obeyed +affluent +incurred +tornadoes +##eber +##zow +emphasizing +highlanders +cheated +helmets +##ctus +internship +terence +bony +executions +legislators +berries +peninsular +tinged +##aco +1689 +amplifier +corvette +ribbons +lavish +pennant +##lander +worthless +##chfield +##forms +mariano 
+pyrenees +expenditures +##icides +chesterfield +mandir +tailor +39th +sergey +nestled +willed +aristocracy +devotees +goodnight +raaf +rumored +weaponry +remy +appropriations +harcourt +burr +riaa +##lence +limitation +unnoticed +guo +soaking +swamps +##tica +collapsing +tatiana +descriptive +brigham +psalm +##chment +maddox +##lization +patti +caliph +##aja +akron +injuring +serra +##ganj +basins +##sari +astonished +launcher +##church +hilary +wilkins +sewing +##sf +stinging +##fia +##ncia +underwood +startup +##ition +compilations +vibrations +embankment +jurist +##nity +bard +juventus +groundwater +kern +palaces +helium +boca +cramped +marissa +soto +##worm +jae +princely +##ggy +faso +bazaar +warmly +##voking +229 +pairing +##lite +##grate +##nets +wien +freaked +ulysses +rebirth +##alia +##rent +mummy +guzman +jimenez +stilled +##nitz +trajectory +tha +woken +archival +professions +##pts +##pta +hilly +shadowy +shrink +##bolt +norwood +glued +migrate +stereotypes +devoid +##pheus +625 +evacuate +horrors +infancy +gotham +knowles +optic +downloaded +sachs +kingsley +parramatta +darryl +mor +##onale +shady +commence +confesses +kan +##meter +##placed +marlborough +roundabout +regents +frigates +io +##imating +gothenburg +revoked +carvings +clockwise +convertible +intruder +##sche +banged +##ogo +vicky +bourgeois +##mony +dupont +footing +##gum +pd +##real +buckle +yun +penthouse +sane +720 +serviced +stakeholders +neumann +bb +##eers +comb +##gam +catchment +pinning +rallies +typing +##elles +forefront +freiburg +sweetie +giacomo +widowed +goodwill +worshipped +aspirations +midday +##vat +fishery +##trick +bournemouth +turk +243 +hearth +ethanol +guadalajara +murmurs +sl +##uge +afforded +scripted +##hta +wah +##jn +coroner +translucent +252 +memorials +puck +progresses +clumsy +##race +315 +candace +recounted +##27 +##slin +##uve +filtering +##mac +howl +strata +heron +leveled +##ays +dubious +##oja +##т +##wheel +citations +exhibiting +##laya +##mics +##pods +turkic +##lberg +injunction +##ennial +##mit +antibodies +##44 +organise +##rigues +cardiovascular +cushion +inverness +##zquez +dia +cocoa +sibling +##tman +##roid +expanse +feasible +tunisian +algiers +##relli +rus +bloomberg +dso +westphalia +bro +tacoma +281 +downloads +##ours +konrad +duran +##hdi +continuum +jett +compares +legislator +secession +##nable +##gues +##zuka +translating +reacher +##gley +##ła +aleppo +##agi +tc +orchards +trapping +linguist +versatile +drumming +postage +calhoun +superiors +##mx +barefoot +leary +##cis +ignacio +alfa +kaplan +##rogen +bratislava +mori +##vot +disturb +haas +313 +cartridges +gilmore +radiated +salford +tunic +hades +##ulsive +archeological +delilah +magistrates +auditioned +brewster +charters +empowerment +blogs +cappella +dynasties +iroquois +whipping +##krishna +raceway +truths +myra +weaken +judah +mcgregor +##horse +mic +refueling +37th +burnley +bosses +markus +premio +query +##gga +dunbar +##economic +darkest +lyndon +sealing +commendation +reappeared +##mun +addicted +ezio +slaughtered +satisfactory +shuffle +##eves +##thic +##uj +fortification +warrington +##otto +resurrected +fargo +mane +##utable +##lei +##space +foreword +ox +##aris +##vern +abrams +hua +##mento +sakura +##alo +uv +sentimental +##skaya +midfield +##eses +sturdy +scrolls +macleod +##kyu +entropy +##lance +mitochondrial +cicero +excelled +thinner +convoys +perceive +##oslav +##urable +systematically +grind +burkina +287 +##tagram +ops +##aman +guantanamo +##cloth +##tite +forcefully +wavy +##jou 
+pointless +##linger +##tze +layton +portico +superficial +clerical +outlaws +##hism +burials +muir +##inn +creditors +hauling +rattle +##leg +calais +monde +archers +reclaimed +dwell +wexford +hellenic +falsely +remorse +##tek +dough +furnishings +##uttered +gabon +neurological +novice +##igraphy +contemplated +pulpit +nightstand +saratoga +##istan +documenting +pulsing +taluk +##firmed +busted +marital +##rien +disagreements +wasps +##yes +hodge +mcdonnell +mimic +fran +pendant +dhabi +musa +##nington +congratulations +argent +darrell +concussion +losers +regrets +thessaloniki +reversal +donaldson +hardwood +thence +achilles +ritter +##eran +demonic +jurgen +prophets +goethe +eki +classmate +buff +##cking +yank +irrational +##inging +perished +seductive +qur +sourced +##crat +##typic +mustard +ravine +barre +horizontally +characterization +phylogenetic +boise +##dit +##runner +##tower +brutally +intercourse +seduce +##bbing +fay +ferris +ogden +amar +nik +unarmed +##inator +evaluating +kyrgyzstan +sweetness +##lford +##oki +mccormick +meiji +notoriety +stimulate +disrupt +figuring +instructional +mcgrath +##zoo +groundbreaking +##lto +flinch +khorasan +agrarian +bengals +mixer +radiating +##sov +ingram +pitchers +nad +tariff +##cript +tata +##codes +##emi +##ungen +appellate +lehigh +##bled +##giri +brawl +duct +texans +##ciation +##ropolis +skipper +speculative +vomit +doctrines +stresses +253 +davy +graders +whitehead +jozef +timely +cumulative +haryana +paints +appropriately +boon +cactus +##ales +##pid +dow +legions +##pit +perceptions +1730 +picturesque +##yse +periphery +rune +wr +##aha +celtics +sentencing +whoa +##erin +confirms +variance +425 +moines +mathews +spade +rave +m1 +fronted +fx +blending +alleging +reared +##gl +237 +##paper +grassroots +eroded +##free +##physical +directs +ordeal +##sław +accelerate +hacker +rooftop +##inia +lev +buys +cebu +devote +##lce +specialising +##ulsion +choreographed +repetition +warehouses +##ryl +paisley +tuscany +analogy +sorcerer +hash +huts +shards +descends +exclude +nix +chaplin +gaga +ito +vane +##drich +causeway +misconduct +limo +orchestrated +glands +jana +##kot +u2 +##mple +##sons +branching +contrasts +scoop +longed +##virus +chattanooga +##75 +syrup +cornerstone +##tized +##mind +##iaceae +careless +precedence +frescoes +##uet +chilled +consult +modelled +snatch +peat +##thermal +caucasian +humane +relaxation +spins +temperance +##lbert +occupations +lambda +hybrids +moons +mp3 +##oese +247 +rolf +societal +yerevan +ness +##ssler +befriended +mechanized +nominate +trough +boasted +cues +seater +##hom +bends +##tangle +conductors +emptiness +##lmer +eurasian +adriatic +tian +##cie +anxiously +lark +propellers +chichester +jock +ev +2a +##holding +credible +recounts +tori +loyalist +abduction +##hoot +##redo +nepali +##mite +ventral +tempting +##ango +##crats +steered +##wice +javelin +dipping +laborers +prentice +looming +titanium +##ː +badges +emir +tensor +##ntation +egyptians +rash +denies +hawthorne +lombard +showers +wehrmacht +dietary +trojan +##reus +welles +executing +horseshoe +lifeboat +##lak +elsa +infirmary +nearing +roberta +boyer +mutter +trillion +joanne +##fine +##oked +sinks +vortex +uruguayan +clasp +sirius +##block +accelerator +prohibit +sunken +byu +chronological +diplomats +ochreous +510 +symmetrical +1644 +maia +##tology +salts +reigns +atrocities +##ия +hess +bared +issn +##vyn +cater +saturated +##cycle +##isse +sable +voyager +dyer +yusuf +##inge +fountains +wolff +##39 +##nni +engraving +rollins 
+atheist +ominous +##ault +herr +chariot +martina +strung +##fell +##farlane +horrific +sahib +gazes +saetan +erased +ptolemy +##olic +flushing +lauderdale +analytic +##ices +530 +navarro +beak +gorilla +herrera +broom +guadalupe +raiding +sykes +311 +bsc +deliveries +1720 +invasions +carmichael +tajikistan +thematic +ecumenical +sentiments +onstage +##rians +##brand +##sume +catastrophic +flanks +molten +##arns +waller +aimee +terminating +##icing +alternately +##oche +nehru +printers +outraged +##eving +empires +template +banners +repetitive +za +##oise +vegetarian +##tell +guiana +opt +cavendish +lucknow +synthesized +##hani +##mada +finalized +##ctable +fictitious +mayoral +unreliable +##enham +embracing +peppers +rbis +##chio +##neo +inhibition +slashed +togo +orderly +embroidered +safari +salty +236 +barron +benito +totaled +##dak +pubs +simulated +caden +devin +tolkien +momma +welding +sesame +##ept +gottingen +hardness +630 +shaman +temeraire +620 +adequately +pediatric +##kit +ck +assertion +radicals +composure +cadence +seafood +beaufort +lazarus +mani +warily +cunning +kurdistan +249 +cantata +##kir +ares +##41 +##clusive +nape +townland +geared +insulted +flutter +boating +violate +draper +dumping +malmo +##hh +##romatic +firearm +alta +bono +obscured +##clave +exceeds +panorama +unbelievable +##train +preschool +##essed +disconnected +installing +rescuing +secretaries +accessibility +##castle +##drive +##ifice +##film +bouts +slug +waterway +mindanao +##buro +##ratic +halves +##ل +calming +liter +maternity +adorable +bragg +electrification +mcc +##dote +roxy +schizophrenia +##body +munoz +kaye +whaling +239 +mil +tingling +tolerant +##ago +unconventional +volcanoes +##finder +deportivo +##llie +robson +kaufman +neuroscience +wai +deportation +masovian +scraping +converse +##bh +hacking +bulge +##oun +administratively +yao +580 +amp +mammoth +booster +claremont +hooper +nomenclature +pursuits +mclaughlin +melinda +##sul +catfish +barclay +substrates +taxa +zee +originals +kimberly +packets +padma +##ality +borrowing +ostensibly +solvent +##bri +##genesis +##mist +lukas +shreveport +veracruz +##ь +##lou +##wives +cheney +tt +anatolia +hobbs +##zyn +cyclic +radiant +alistair +greenish +siena +dat +independents +##bation +conform +pieter +hyper +applicant +bradshaw +spores +telangana +vinci +inexpensive +nuclei +322 +jang +nme +soho +spd +##ign +cradled +receptionist +pow +##43 +##rika +fascism +##ifer +experimenting +##ading +##iec +##region +345 +jocelyn +maris +stair +nocturnal +toro +constabulary +elgin +##kker +msc +##giving +##schen +##rase +doherty +doping +sarcastically +batter +maneuvers +##cano +##apple +##gai +##git +intrinsic +##nst +##stor +1753 +showtime +cafes +gasps +lviv +ushered +##thed +fours +restart +astonishment +transmitting +flyer +shrugs +##sau +intriguing +cones +dictated +mushrooms +medial +##kovsky +##elman +escorting +gaped +##26 +godfather +##door +##sell +djs +recaptured +timetable +vila +1710 +3a +aerodrome +mortals +scientology +##orne +angelina +mag +convection +unpaid +insertion +intermittent +lego +##nated +endeavor +kota +pereira +##lz +304 +bwv +glamorgan +insults +agatha +fey +##cend +fleetwood +mahogany +protruding +steamship +zeta +##arty +mcguire +suspense +##sphere +advising +urges +##wala +hurriedly +meteor +gilded +inline +arroyo +stalker +##oge +excitedly +revered +##cure +earle +introductory +##break +##ilde +mutants +puff +pulses +reinforcement +##haling +curses +lizards +stalk +correlated +##fixed +fallout +macquarie +##unas 
+bearded +denton +heaving +802 +##ocation +winery +assign +dortmund +##lkirk +everest +invariant +charismatic +susie +##elling +bled +lesley +telegram +sumner +bk +##ogen +##к +wilcox +needy +colbert +duval +##iferous +##mbled +allotted +attends +imperative +##hita +replacements +hawker +##inda +insurgency +##zee +##eke +casts +##yla +680 +ives +transitioned +##pack +##powering +authoritative +baylor +flex +cringed +plaintiffs +woodrow +##skie +drastic +ape +aroma +unfolded +commotion +nt +preoccupied +theta +routines +lasers +privatization +wand +domino +ek +clenching +nsa +strategically +showered +bile +handkerchief +pere +storing +christophe +insulting +316 +nakamura +romani +asiatic +magdalena +palma +cruises +stripping +405 +konstantin +soaring +##berman +colloquially +forerunner +havilland +incarcerated +parasites +sincerity +##utus +disks +plank +saigon +##ining +corbin +homo +ornaments +powerhouse +##tlement +chong +fastened +feasibility +idf +morphological +usable +##nish +##zuki +aqueduct +jaguars +keepers +##flies +aleksandr +faust +assigns +ewing +bacterium +hurled +tricky +hungarians +integers +wallis +321 +yamaha +##isha +hushed +oblivion +aviator +evangelist +friars +##eller +monograph +ode +##nary +airplanes +labourers +charms +##nee +1661 +hagen +tnt +rudder +fiesta +transcript +dorothea +ska +inhibitor +maccabi +retorted +raining +encompassed +clauses +menacing +1642 +lineman +##gist +vamps +##ape +##dick +gloom +##rera +dealings +easing +seekers +##nut +##pment +helens +unmanned +##anu +##isson +basics +##amy +##ckman +adjustments +1688 +brutality +horne +##zell +sui +##55 +##mable +aggregator +##thal +rhino +##drick +##vira +counters +zoom +##01 +##rting +mn +montenegrin +packard +##unciation +##♭ +##kki +reclaim +scholastic +thugs +pulsed +##icia +syriac +quan +saddam +banda +kobe +blaming +buddies +dissent +##lusion +##usia +corbett +jaya +delle +erratic +lexie +##hesis +435 +amiga +hermes +##pressing +##leen +chapels +gospels +jamal +##uating +compute +revolving +warp +##sso +##thes +armory +##eras +##gol +antrim +loki +##kow +##asian +##good +##zano +braid +handwriting +subdistrict +funky +pantheon +##iculate +concurrency +estimation +improper +juliana +##his +newcomers +johnstone +staten +communicated +##oco +##alle +sausage +stormy +##stered +##tters +superfamily +##grade +acidic +collateral +tabloid +##oped +##rza +bladder +austen +##ellant +mcgraw +##hay +hannibal +mein +aquino +lucifer +wo +badger +boar +cher +christensen +greenberg +interruption +##kken +jem +244 +mocked +bottoms +cambridgeshire +##lide +sprawling +##bbly +eastwood +ghent +synth +##buck +advisers +##bah +nominally +hapoel +qu +daggers +estranged +fabricated +towels +vinnie +wcw +misunderstanding +anglia +nothin +unmistakable +##dust +##lova +chilly +marquette +truss +##edge +##erine +reece +##lty +##chemist +##connected +272 +308 +41st +bash +raion +waterfalls +##ump +##main +labyrinth +queue +theorist +##istle +bharatiya +flexed +soundtracks +rooney +leftist +patrolling +wharton +plainly +alleviate +eastman +schuster +topographic +engages +immensely +unbearable +fairchild +1620 +dona +lurking +parisian +oliveira +ia +indictment +hahn +bangladeshi +##aster +vivo +##uming +##ential +antonia +expects +indoors +kildare +harlan +##logue +##ogenic +##sities +forgiven +##wat +childish +tavi +##mide +##orra +plausible +grimm +successively +scooted +##bola +##dget +##rith +spartans +emery +flatly +azure +epilogue +##wark +flourish +##iny +##tracted +##overs +##oshi +bestseller +distressed +receipt 
+spitting +hermit +topological +##cot +drilled +subunit +francs +##layer +eel +##fk +##itas +octopus +footprint +petitions +ufo +##say +##foil +interfering +leaking +palo +##metry +thistle +valiant +##pic +narayan +mcpherson +##fast +gonzales +##ym +##enne +dustin +novgorod +solos +##zman +doin +##raph +##patient +##meyer +soluble +ashland +cuffs +carole +pendleton +whistling +vassal +##river +deviation +revisited +constituents +rallied +rotate +loomed +##eil +##nting +amateurs +augsburg +auschwitz +crowns +skeletons +##cona +bonnet +257 +dummy +globalization +simeon +sleeper +mandal +differentiated +##crow +##mare +milne +bundled +exasperated +talmud +owes +segregated +##feng +##uary +dentist +piracy +props +##rang +devlin +##torium +malicious +paws +##laid +dependency +##ergy +##fers +##enna +258 +pistons +rourke +jed +grammatical +tres +maha +wig +512 +ghostly +jayne +##achal +##creen +##ilis +##lins +##rence +designate +##with +arrogance +cambodian +clones +showdown +throttle +twain +##ception +lobes +metz +nagoya +335 +braking +##furt +385 +roaming +##minster +amin +crippled +##37 +##llary +indifferent +hoffmann +idols +intimidating +1751 +261 +influenza +memo +onions +1748 +bandage +consciously +##landa +##rage +clandestine +observes +swiped +tangle +##ener +##jected +##trum +##bill +##lta +hugs +congresses +josiah +spirited +##dek +humanist +managerial +filmmaking +inmate +rhymes +debuting +grimsby +ur +##laze +duplicate +vigor +##tf +republished +bolshevik +refurbishment +antibiotics +martini +methane +newscasts +royale +horizons +levant +iain +visas +##ischen +paler +##around +manifestation +snuck +alf +chop +futile +pedestal +rehab +##kat +bmg +kerman +res +fairbanks +jarrett +abstraction +saharan +##zek +1746 +procedural +clearer +kincaid +sash +luciano +##ffey +crunch +helmut +##vara +revolutionaries +##tute +creamy +leach +##mmon +1747 +permitting +nes +plight +wendell +##lese +contra +ts +clancy +ipa +mach +staples +autopsy +disturbances +nueva +karin +pontiac +##uding +proxy +venerable +haunt +leto +bergman +expands +##helm +wal +##pipe +canning +celine +cords +obesity +##enary +intrusion +planner +##phate +reasoned +sequencing +307 +harrow +##chon +##dora +marred +mcintyre +repay +tarzan +darting +248 +harrisburg +margarita +repulsed +##hur +##lding +belinda +hamburger +novo +compliant +runways +bingham +registrar +skyscraper +ic +cuthbert +improvisation +livelihood +##corp +##elial +admiring +##dened +sporadic +believer +casablanca +popcorn +##29 +asha +shovel +##bek +##dice +coiled +tangible +##dez +casper +elsie +resin +tenderness +rectory +##ivision +avail +sonar +##mori +boutique +##dier +guerre +bathed +upbringing +vaulted +sandals +blessings +##naut +##utnant +1680 +306 +foxes +pia +corrosion +hesitantly +confederates +crystalline +footprints +shapiro +tirana +valentin +drones +45th +microscope +shipments +texted +inquisition +wry +guernsey +unauthorized +resigning +760 +ripple +schubert +stu +reassure +felony +##ardo +brittle +koreans +##havan +##ives +dun +implicit +tyres +##aldi +##lth +magnolia +##ehan +##puri +##poulos +aggressively +fei +gr +familiarity +##poo +indicative +##trust +fundamentally +jimmie +overrun +395 +anchors +moans +##opus +britannia +armagh +##ggle +purposely +seizing +##vao +bewildered +mundane +avoidance +cosmopolitan +geometridae +quartermaster +caf +415 +chatter +engulfed +gleam +purge +##icate +juliette +jurisprudence +guerra +revisions +##bn +casimir +brew +##jm +1749 +clapton +cloudy +conde +hermitage +278 +simulations +torches 
+vincenzo +matteo +##rill +hidalgo +booming +westbound +accomplishment +tentacles +unaffected +##sius +annabelle +flopped +sloping +##litz +dreamer +interceptor +vu +##loh +consecration +copying +messaging +breaker +climates +hospitalized +1752 +torino +afternoons +winfield +witnessing +##teacher +breakers +choirs +sawmill +coldly +##ege +sipping +haste +uninhabited +conical +bibliography +pamphlets +severn +edict +##oca +deux +illnesses +grips +##pl +rehearsals +sis +thinkers +tame +##keepers +1690 +acacia +reformer +##osed +##rys +shuffling +##iring +##shima +eastbound +ionic +rhea +flees +littered +##oum +rocker +vomiting +groaning +champ +overwhelmingly +civilizations +paces +sloop +adoptive +##tish +skaters +##vres +aiding +mango +##joy +nikola +shriek +##ignon +pharmaceuticals +##mg +tuna +calvert +gustavo +stocked +yearbook +##urai +##mana +computed +subsp +riff +hanoi +kelvin +hamid +moors +pastures +summons +jihad +nectar +##ctors +bayou +untitled +pleasing +vastly +republics +intellect +##η +##ulio +##tou +crumbling +stylistic +sb +##ی +consolation +frequented +h₂o +walden +widows +##iens +404 +##ignment +chunks +improves +288 +grit +recited +##dev +snarl +sociological +##arte +##gul +inquired +##held +bruise +clube +consultancy +homogeneous +hornets +multiplication +pasta +prick +savior +##grin +##kou +##phile +yoon +##gara +grimes +vanishing +cheering +reacting +bn +distillery +##quisite +##vity +coe +dockyard +massif +##jord +escorts +voss +##valent +byte +chopped +hawke +illusions +workings +floats +##koto +##vac +kv +annapolis +madden +##onus +alvaro +noctuidae +##cum +##scopic +avenge +steamboat +forte +illustrates +erika +##trip +570 +dew +nationalities +bran +manifested +thirsty +diversified +muscled +reborn +##standing +arson +##lessness +##dran +##logram +##boys +##kushima +##vious +willoughby +##phobia +286 +alsace +dashboard +yuki +##chai +granville +myspace +publicized +tricked +##gang +adjective +##ater +relic +reorganisation +enthusiastically +indications +saxe +##lassified +consolidate +iec +padua +helplessly +ramps +renaming +regulars +pedestrians +accents +convicts +inaccurate +lowers +mana +##pati +barrie +bjp +outta +someplace +berwick +flanking +invoked +marrow +sparsely +excerpts +clothed +rei +##ginal +wept +##straße +##vish +alexa +excel +##ptive +membranes +aquitaine +creeks +cutler +sheppard +implementations +ns +##dur +fragrance +budge +concordia +magnesium +marcelo +##antes +gladly +vibrating +##rral +##ggles +montrose +##omba +lew +seamus +1630 +cocky +##ament +##uen +bjorn +##rrick +fielder +fluttering +##lase +methyl +kimberley +mcdowell +reductions +barbed +##jic +##tonic +aeronautical +condensed +distracting +##promising +huffed +##cala +##sle +claudius +invincible +missy +pious +balthazar +ci +##lang +butte +combo +orson +##dication +myriad +1707 +silenced +##fed +##rh +coco +netball +yourselves +##oza +clarify +heller +peg +durban +etudes +offender +roast +blackmail +curvature +##woods +vile +309 +illicit +suriname +##linson +overture +1685 +bubbling +gymnast +tucking +##mming +##ouin +maldives +##bala +gurney +##dda +##eased +##oides +backside +pinto +jars +racehorse +tending +##rdial +baronetcy +wiener +duly +##rke +barbarian +cupping +flawed +##thesis +bertha +pleistocene +puddle +swearing +##nob +##tically +fleeting +prostate +amulet +educating +##mined +##iti +##tler +75th +jens +respondents +analytics +cavaliers +papacy +raju +##iente +##ulum +##tip +funnel +271 +disneyland +##lley +sociologist +##iam +2500 +faulkner +louvre +menon +##dson 
+276 +##ower +afterlife +mannheim +peptide +referees +comedians +meaningless +##anger +##laise +fabrics +hurley +renal +sleeps +##bour +##icle +breakout +kristin +roadside +animator +clover +disdain +unsafe +redesign +##urity +firth +barnsley +portage +reset +narrows +268 +commandos +expansive +speechless +tubular +##lux +essendon +eyelashes +smashwords +##yad +##bang +##claim +craved +sprinted +chet +somme +astor +wrocław +orton +266 +bane +##erving +##uing +mischief +##amps +##sund +scaling +terre +##xious +impairment +offenses +undermine +moi +soy +contiguous +arcadia +inuit +seam +##tops +macbeth +rebelled +##icative +##iot +590 +elaborated +frs +uniformed +##dberg +259 +powerless +priscilla +stimulated +980 +qc +arboretum +frustrating +trieste +bullock +##nified +enriched +glistening +intern +##adia +locus +nouvelle +ollie +ike +lash +starboard +ee +tapestry +headlined +hove +rigged +##vite +pollock +##yme +thrive +clustered +cas +roi +gleamed +olympiad +##lino +pressured +regimes +##hosis +##lick +ripley +##ophone +kickoff +gallon +rockwell +##arable +crusader +glue +revolutions +scrambling +1714 +grover +##jure +englishman +aztec +263 +contemplating +coven +ipad +preach +triumphant +tufts +##esian +rotational +##phus +328 +falkland +##brates +strewn +clarissa +rejoin +environmentally +glint +banded +drenched +moat +albanians +johor +rr +maestro +malley +nouveau +shaded +taxonomy +v6 +adhere +bunk +airfields +##ritan +1741 +encompass +remington +tran +##erative +amelie +mazda +friar +morals +passions +##zai +breadth +vis +##hae +argus +burnham +caressing +insider +rudd +##imov +##mini +##rso +italianate +murderous +textual +wainwright +armada +bam +weave +timer +##taken +##nh +fra +##crest +ardent +salazar +taps +tunis +##ntino +allegro +gland +philanthropic +##chester +implication +##optera +esq +judas +noticeably +wynn +##dara +inched +indexed +crises +villiers +bandit +royalties +patterned +cupboard +interspersed +accessory +isla +kendrick +entourage +stitches +##esthesia +headwaters +##ior +interlude +distraught +draught +1727 +##basket +biased +sy +transient +triad +subgenus +adapting +kidd +shortstop +##umatic +dimly +spiked +mcleod +reprint +nellie +pretoria +windmill +##cek +singled +##mps +273 +reunite +##orous +747 +bankers +outlying +##omp +##ports +##tream +apologies +cosmetics +patsy +##deh +##ocks +##yson +bender +nantes +serene +##nad +lucha +mmm +323 +##cius +##gli +cmll +coinage +nestor +juarez +##rook +smeared +sprayed +twitching +sterile +irina +embodied +juveniles +enveloped +miscellaneous +cancers +dq +gulped +luisa +crested +swat +donegal +ref +##anov +##acker +hearst +mercantile +##lika +doorbell +ua +vicki +##alla +##som +bilbao +psychologists +stryker +sw +horsemen +turkmenistan +wits +##national +anson +mathew +screenings +##umb +rihanna +##agne +##nessy +aisles +##iani +##osphere +hines +kenton +saskatoon +tasha +truncated +##champ +##itan +mildred +advises +fredrik +interpreting +inhibitors +##athi +spectroscopy +##hab +##kong +karim +panda +##oia +##nail +##vc +conqueror +kgb +leukemia +##dity +arrivals +cheered +pisa +phosphorus +shielded +##riated +mammal +unitarian +urgently +chopin +sanitary +##mission +spicy +drugged +hinges +##tort +tipping +trier +impoverished +westchester +##caster +267 +epoch +nonstop +##gman +##khov +aromatic +centrally +cerro +##tively +##vio +billions +modulation +sedimentary +283 +facilitating +outrageous +goldstein +##eak +##kt +ld +maitland +penultimate +pollard +##dance +fleets +spaceship +vertebrae +##nig +alcoholism +als 
+recital +##bham +##ference +##omics +m2 +##bm +trois +##tropical +##в +commemorates +##meric +marge +##raction +1643 +670 +cosmetic +ravaged +##ige +catastrophe +eng +##shida +albrecht +arterial +bellamy +decor +harmon +##rde +bulbs +synchronized +vito +easiest +shetland +shielding +wnba +##glers +##ssar +##riam +brianna +cumbria +##aceous +##rard +cores +thayer +##nsk +brood +hilltop +luminous +carts +keynote +larkin +logos +##cta +##ا +##mund +##quay +lilith +tinted +277 +wrestle +mobilization +##uses +sequential +siam +bloomfield +takahashi +274 +##ieving +presenters +ringo +blazed +witty +##oven +##ignant +devastation +haydn +harmed +newt +therese +##peed +gershwin +molina +rabbis +sudanese +001 +innate +restarted +##sack +##fus +slices +wb +##shah +enroll +hypothetical +hysterical +1743 +fabio +indefinite +warped +##hg +exchanging +525 +unsuitable +##sboro +gallo +1603 +bret +cobalt +homemade +##hunter +mx +operatives +##dhar +terraces +durable +latch +pens +whorls +##ctuated +##eaux +billing +ligament +succumbed +##gly +regulators +spawn +##brick +##stead +filmfare +rochelle +##nzo +1725 +circumstance +saber +supplements +##nsky +##tson +crowe +wellesley +carrot +##9th +##movable +primate +drury +sincerely +topical +##mad +##rao +callahan +kyiv +smarter +tits +undo +##yeh +announcements +anthologies +barrio +nebula +##islaus +##shaft +##tyn +bodyguards +2021 +assassinate +barns +emmett +scully +##mah +##yd +##eland +##tino +##itarian +demoted +gorman +lashed +prized +adventist +writ +##gui +alla +invertebrates +##ausen +1641 +amman +1742 +align +healy +redistribution +##gf +##rize +insulation +##drop +adherents +hezbollah +vitro +ferns +yanking +269 +php +registering +uppsala +cheerleading +confines +mischievous +tully +##ross +49th +docked +roam +stipulated +pumpkin +##bry +prompt +##ezer +blindly +shuddering +craftsmen +frail +scented +katharine +scramble +shaggy +sponge +helix +zaragoza +279 +##52 +43rd +backlash +fontaine +seizures +posse +cowan +nonfiction +telenovela +wwii +hammered +undone +##gpur +encircled +irs +##ivation +artefacts +oneself +searing +smallpox +##belle +##osaurus +shandong +breached +upland +blushing +rankin +infinitely +psyche +tolerated +docking +evicted +##col +unmarked +##lving +gnome +lettering +litres +musique +##oint +benevolent +##jal +blackened +##anna +mccall +racers +tingle +##ocene +##orestation +introductions +radically +292 +##hiff +##باد +1610 +1739 +munchen +plead +##nka +condo +scissors +##sight +##tens +apprehension +##cey +##yin +hallmark +watering +formulas +sequels +##llas +aggravated +bae +commencing +##building +enfield +prohibits +marne +vedic +civilized +euclidean +jagger +beforehand +blasts +dumont +##arney +##nem +740 +conversions +hierarchical +rios +simulator +##dya +##lellan +hedges +oleg +thrusts +shadowed +darby +maximize +1744 +gregorian +##nded +##routed +sham +unspecified +##hog +emory +factual +##smo +##tp +fooled +##rger +ortega +wellness +marlon +##oton +##urance +casket +keating +ley +enclave +##ayan +char +influencing +jia +##chenko +412 +ammonia +erebidae +incompatible +violins +cornered +##arat +grooves +astronauts +columbian +rampant +fabrication +kyushu +mahmud +vanish +##dern +mesopotamia +##lete +ict +##rgen +caspian +kenji +pitted +##vered +999 +grimace +roanoke +tchaikovsky +twinned +##analysis +##awan +xinjiang +arias +clemson +kazakh +sizable +1662 +##khand +##vard +plunge +tatum +vittorio +##nden +cholera +##dana +##oper +bracing +indifference +projectile +superliga +##chee +realises +upgrading +299 +porte 
+retribution +##vies +nk +stil +##resses +ama +bureaucracy +blackberry +bosch +testosterone +collapses +greer +##pathic +ioc +fifties +malls +##erved +bao +baskets +adolescents +siegfried +##osity +##tosis +mantra +detecting +existent +fledgling +##cchi +dissatisfied +gan +telecommunication +mingled +sobbed +6000 +controversies +outdated +taxis +##raus +fright +slams +##lham +##fect +##tten +detectors +fetal +tanned +##uw +fray +goth +olympian +skipping +mandates +scratches +sheng +unspoken +hyundai +tracey +hotspur +restrictive +##buch +americana +mundo +##bari +burroughs +diva +vulcan +##6th +distinctions +thumping +##ngen +mikey +sheds +fide +rescues +springsteen +vested +valuation +##ece +##ely +pinnacle +rake +sylvie +##edo +almond +quivering +##irus +alteration +faltered +##wad +51st +hydra +ticked +##kato +recommends +##dicated +antigua +arjun +stagecoach +wilfred +trickle +pronouns +##pon +aryan +nighttime +##anian +gall +pea +stitch +##hei +leung +milos +##dini +eritrea +nexus +starved +snowfall +kant +parasitic +cot +discus +hana +strikers +appleton +kitchens +##erina +##partisan +##itha +##vius +disclose +metis +##channel +1701 +tesla +##vera +fitch +1735 +blooded +##tila +decimal +##tang +##bai +cyclones +eun +bottled +peas +pensacola +basha +bolivian +crabs +boil +lanterns +partridge +roofed +1645 +necks +##phila +opined +patting +##kla +##lland +chuckles +volta +whereupon +##nche +devout +euroleague +suicidal +##dee +inherently +involuntary +knitting +nasser +##hide +puppets +colourful +courageous +southend +stills +miraculous +hodgson +richer +rochdale +ethernet +greta +uniting +prism +umm +##haya +##itical +##utation +deterioration +pointe +prowess +##ropriation +lids +scranton +billings +subcontinent +##koff +##scope +brute +kellogg +psalms +degraded +##vez +stanisław +##ructured +ferreira +pun +astonishing +gunnar +##yat +arya +prc +gottfried +##tight +excursion +##ographer +dina +##quil +##nare +huffington +illustrious +wilbur +gundam +verandah +##zard +naacp +##odle +constructive +fjord +kade +##naud +generosity +thrilling +baseline +cayman +frankish +plastics +accommodations +zoological +##fting +cedric +qb +motorized +##dome +##otted +squealed +tackled +canucks +budgets +situ +asthma +dail +gabled +grasslands +whimpered +writhing +judgments +##65 +minnie +pv +##carbon +bananas +grille +domes +monique +odin +maguire +markham +tierney +##estra +##chua +libel +poke +speedy +atrium +laval +notwithstanding +##edly +fai +kala +##sur +robb +##sma +listings +luz +supplementary +tianjin +##acing +enzo +jd +ric +scanner +croats +transcribed +##49 +arden +cv +##hair +##raphy +##lver +##uy +357 +seventies +staggering +alam +horticultural +hs +regression +timbers +blasting +##ounded +montagu +manipulating +##cit +catalytic +1550 +troopers +##meo +condemnation +fitzpatrick +##oire +##roved +inexperienced +1670 +castes +##lative +outing +314 +dubois +flicking +quarrel +ste +learners +1625 +iq +whistled +##class +282 +classify +tariffs +temperament +355 +folly +liszt +##yles +immersed +jordanian +ceasefire +apparel +extras +maru +fished +##bio +harta +stockport +assortment +craftsman +paralysis +transmitters +##cola +blindness +##wk +fatally +proficiency +solemnly +##orno +repairing +amore +groceries +ultraviolet +##chase +schoolhouse +##tua +resurgence +nailed +##otype +##× +ruse +saliva +diagrams +##tructing +albans +rann +thirties +1b +antennas +hilarious +cougars +paddington +stats +##eger +breakaway +ipod +reza +authorship +prohibiting +scoffed +##etz +##ttle +conscription 
+defected +trondheim +##fires +ivanov +keenan +##adan +##ciful +##fb +##slow +locating +##ials +##tford +cadiz +basalt +blankly +interned +rags +rattling +##tick +carpathian +reassured +sync +bum +guildford +iss +staunch +##onga +astronomers +sera +sofie +emergencies +susquehanna +##heard +duc +mastery +vh1 +williamsburg +bayer +buckled +craving +##khan +##rdes +bloomington +##write +alton +barbecue +##bians +justine +##hri +##ndt +delightful +smartphone +newtown +photon +retrieval +peugeot +hissing +##monium +##orough +flavors +lighted +relaunched +tainted +##games +##lysis +anarchy +microscopic +hopping +adept +evade +evie +##beau +inhibit +sinn +adjustable +hurst +intuition +wilton +cisco +44th +lawful +lowlands +stockings +thierry +##dalen +##hila +##nai +fates +prank +tb +maison +lobbied +provocative +1724 +4a +utopia +##qual +carbonate +gujarati +purcell +##rford +curtiss +##mei +overgrown +arenas +mediation +swallows +##rnik +respectful +turnbull +##hedron +##hope +alyssa +ozone +##ʻi +ami +gestapo +johansson +snooker +canteen +cuff +declines +empathy +stigma +##ags +##iner +##raine +taxpayers +gui +volga +##wright +##copic +lifespan +overcame +tattooed +enactment +giggles +##ador +##camp +barrington +bribe +obligatory +orbiting +peng +##enas +elusive +sucker +##vating +cong +hardship +empowered +anticipating +estrada +cryptic +greasy +detainees +planck +sudbury +plaid +dod +marriott +kayla +##ears +##vb +##zd +mortally +##hein +cognition +radha +319 +liechtenstein +meade +richly +argyle +harpsichord +liberalism +trumpets +lauded +tyrant +salsa +tiled +lear +promoters +reused +slicing +trident +##chuk +##gami +##lka +cantor +checkpoint +##points +gaul +leger +mammalian +##tov +##aar +##schaft +doha +frenchman +nirvana +##vino +delgado +headlining +##eron +##iography +jug +tko +1649 +naga +intersections +##jia +benfica +nawab +##suka +ashford +gulp +##deck +##vill +##rug +brentford +frazier +pleasures +dunne +potsdam +shenzhen +dentistry +##tec +flanagan +##dorff +##hear +chorale +dinah +prem +quezon +##rogated +relinquished +sutra +terri +##pani +flaps +##rissa +poly +##rnet +homme +aback +##eki +linger +womb +##kson +##lewood +doorstep +orthodoxy +threaded +westfield +##rval +dioceses +fridays +subsided +##gata +loyalists +##biotic +##ettes +letterman +lunatic +prelate +tenderly +invariably +souza +thug +winslow +##otide +furlongs +gogh +jeopardy +##runa +pegasus +##umble +humiliated +standalone +tagged +##roller +freshmen +klan +##bright +attaining +initiating +transatlantic +logged +viz +##uance +1723 +combatants +intervening +stephane +chieftain +despised +grazed +317 +cdc +galveston +godzilla +macro +simulate +##planes +parades +##esses +960 +##ductive +##unes +equator +overdose +##cans +##hosh +##lifting +joshi +epstein +sonora +treacherous +aquatics +manchu +responsive +##sation +supervisory +##christ +##llins +##ibar +##balance +##uso +kimball +karlsruhe +mab +##emy +ignores +phonetic +reuters +spaghetti +820 +almighty +danzig +rumbling +tombstone +designations +lured +outset +##felt +supermarkets +##wt +grupo +kei +kraft +susanna +##blood +comprehension +genealogy +##aghan +##verted +redding +##ythe +1722 +bowing +##pore +##roi +lest +sharpened +fulbright +valkyrie +sikhs +##unds +swans +bouquet +merritt +##tage +##venting +commuted +redhead +clerks +leasing +cesare +dea +hazy +##vances +fledged +greenfield +servicemen +##gical +armando +blackout +dt +sagged +downloadable +intra +potion +pods +##4th +##mism +xp +attendants +gambia +stale +##ntine +plump +asteroids 
+rediscovered +buds +flea +hive +##neas +1737 +classifications +debuts +##eles +olympus +scala +##eurs +##gno +##mute +hummed +sigismund +visuals +wiggled +await +pilasters +clench +sulfate +##ances +bellevue +enigma +trainee +snort +##sw +clouded +denim +##rank +##rder +churning +hartman +lodges +riches +sima +##missible +accountable +socrates +regulates +mueller +##cr +1702 +avoids +solids +himalayas +nutrient +pup +##jevic +squat +fades +nec +##lates +##pina +##rona +##ου +privateer +tequila +##gative +##mpton +apt +hornet +immortals +##dou +asturias +cleansing +dario +##rries +##anta +etymology +servicing +zhejiang +##venor +##nx +horned +erasmus +rayon +relocating +£10 +##bags +escalated +promenade +stubble +2010s +artisans +axial +liquids +mora +sho +yoo +##tsky +bundles +oldies +##nally +notification +bastion +##ths +sparkle +##lved +1728 +leash +pathogen +highs +##hmi +immature +880 +gonzaga +ignatius +mansions +monterrey +sweets +bryson +##loe +polled +regatta +brightest +pei +rosy +squid +hatfield +payroll +addict +meath +cornerback +heaviest +lodging +##mage +capcom +rippled +##sily +barnet +mayhem +ymca +snuggled +rousseau +##cute +blanchard +284 +fragmented +leighton +chromosomes +risking +##md +##strel +##utter +corinne +coyotes +cynical +hiroshi +yeomanry +##ractive +ebook +grading +mandela +plume +agustin +magdalene +##rkin +bea +femme +trafford +##coll +##lun +##tance +52nd +fourier +upton +##mental +camilla +gust +iihf +islamabad +longevity +##kala +feldman +netting +##rization +endeavour +foraging +mfa +orr +##open +greyish +contradiction +graz +##ruff +handicapped +marlene +tweed +oaxaca +spp +campos +miocene +pri +configured +cooks +pluto +cozy +pornographic +##entes +70th +fairness +glided +jonny +lynne +rounding +sired +##emon +##nist +remade +uncover +##mack +complied +lei +newsweek +##jured +##parts +##enting +##pg +293 +finer +guerrillas +athenian +deng +disused +stepmother +accuse +gingerly +seduction +521 +confronting +##walker +##going +gora +nostalgia +sabres +virginity +wrenched +##minated +syndication +wielding +eyre +##56 +##gnon +##igny +behaved +taxpayer +sweeps +##growth +childless +gallant +##ywood +amplified +geraldine +scrape +##ffi +babylonian +fresco +##rdan +##kney +##position +1718 +restricting +tack +fukuoka +osborn +selector +partnering +##dlow +318 +gnu +kia +tak +whitley +gables +##54 +##mania +mri +softness +immersion +##bots +##evsky +1713 +chilling +insignificant +pcs +##uis +elites +lina +purported +supplemental +teaming +##americana +##dding +##inton +proficient +rouen +##nage +##rret +niccolo +selects +##bread +fluffy +1621 +gruff +knotted +mukherjee +polgara +thrash +nicholls +secluded +smoothing +thru +corsica +loaf +whitaker +inquiries +##rrier +##kam +indochina +289 +marlins +myles +peking +##tea +extracts +pastry +superhuman +connacht +vogel +##ditional +##het +##udged +##lash +gloss +quarries +refit +teaser +##alic +##gaon +20s +materialized +sling +camped +pickering +tung +tracker +pursuant +##cide +cranes +soc +##cini +##typical +##viere +anhalt +overboard +workout +chores +fares +orphaned +stains +##logie +fenton +surpassing +joyah +triggers +##itte +grandmaster +##lass +##lists +clapping +fraudulent +ledger +nagasaki +##cor +##nosis +##tsa +eucalyptus +tun +##icio +##rney +##tara +dax +heroism +ina +wrexham +onboard +unsigned +##dates +moshe +galley +winnie +droplets +exiles +praises +watered +noodles +##aia +fein +adi +leland +multicultural +stink +bingo +comets +erskine +modernized +canned +constraint +domestically 
+chemotherapy +featherweight +stifled +##mum +darkly +irresistible +refreshing +hasty +isolate +##oys +kitchener +planners +##wehr +cages +yarn +implant +toulon +elects +childbirth +yue +##lind +##lone +cn +rightful +sportsman +junctions +remodeled +specifies +##rgh +291 +##oons +complimented +##urgent +lister +ot +##logic +bequeathed +cheekbones +fontana +gabby +##dial +amadeus +corrugated +maverick +resented +triangles +##hered +##usly +nazareth +tyrol +1675 +assent +poorer +sectional +aegean +##cous +296 +nylon +ghanaian +##egorical +##weig +cushions +forbid +fusiliers +obstruction +somerville +##scia +dime +earrings +elliptical +leyte +oder +polymers +timmy +atm +midtown +piloted +settles +continual +externally +mayfield +##uh +enrichment +henson +keane +persians +1733 +benji +braden +pep +324 +##efe +contenders +pepsi +valet +##isches +298 +##asse +##earing +goofy +stroll +##amen +authoritarian +occurrences +adversary +ahmedabad +tangent +toppled +dorchester +1672 +modernism +marxism +islamist +charlemagne +exponential +racks +unicode +brunette +mbc +pic +skirmish +##bund +##lad +##powered +##yst +hoisted +messina +shatter +##ctum +jedi +vantage +##music +##neil +clemens +mahmoud +corrupted +authentication +lowry +nils +##washed +omnibus +wounding +jillian +##itors +##opped +serialized +narcotics +handheld +##arm +##plicity +intersecting +stimulating +##onis +crate +fellowships +hemingway +casinos +climatic +fordham +copeland +drip +beatty +leaflets +robber +brothel +madeira +##hedral +sphinx +ultrasound +##vana +valor +forbade +leonid +villas +##aldo +duane +marquez +##cytes +disadvantaged +forearms +kawasaki +reacts +consular +lax +uncles +uphold +##hopper +concepcion +dorsey +lass +##izan +arching +passageway +1708 +researches +tia +internationals +##graphs +##opers +distinguishes +javanese +divert +##uven +plotted +##listic +##rwin +##erik +##tify +affirmative +signifies +validation +##bson +kari +felicity +georgina +zulu +##eros +##rained +##rath +overcoming +##dot +argyll +##rbin +1734 +chiba +ratification +windy +earls +parapet +##marks +hunan +pristine +astrid +punta +##gart +brodie +##kota +##oder +malaga +minerva +rouse +##phonic +bellowed +pagoda +portals +reclamation +##gur +##odies +##⁄₄ +parentheses +quoting +allergic +palette +showcases +benefactor +heartland +nonlinear +##tness +bladed +cheerfully +scans +##ety +##hone +1666 +girlfriends +pedersen +hiram +sous +##liche +##nator +1683 +##nery +##orio +##umen +bobo +primaries +smiley +##cb +unearthed +uniformly +fis +metadata +1635 +ind +##oted +recoil +##titles +##tura +##ια +406 +hilbert +jamestown +mcmillan +tulane +seychelles +##frid +antics +coli +fated +stucco +##grants +1654 +bulky +accolades +arrays +caledonian +carnage +optimism +puebla +##tative +##cave +enforcing +rotherham +seo +dunlop +aeronautics +chimed +incline +zoning +archduke +hellenistic +##oses +##sions +candi +thong +##ople +magnate +rustic +##rsk +projective +slant +##offs +danes +hollis +vocalists +##ammed +congenital +contend +gesellschaft +##ocating +##pressive +douglass +quieter +##cm +##kshi +howled +salim +spontaneously +townsville +buena +southport +##bold +kato +1638 +faerie +stiffly +##vus +##rled +297 +flawless +realising +taboo +##7th +bytes +straightening +356 +jena +##hid +##rmin +cartwright +berber +bertram +soloists +411 +noses +417 +coping +fission +hardin +inca +##cen +1717 +mobilized +vhf +##raf +biscuits +curate +##85 +##anial +331 +gaunt +neighbourhoods +1540 +##abas +blanca +bypassed +sockets +behold +coincidentally +##bane 
+nara +shave +splinter +terrific +##arion +##erian +commonplace +juris +redwood +waistband +boxed +caitlin +fingerprints +jennie +naturalized +##ired +balfour +craters +jody +bungalow +hugely +quilt +glitter +pigeons +undertaker +bulging +constrained +goo +##sil +##akh +assimilation +reworked +##person +persuasion +##pants +felicia +##cliff +##ulent +1732 +explodes +##dun +##inium +##zic +lyman +vulture +hog +overlook +begs +northwards +ow +spoil +##urer +fatima +favorably +accumulate +sargent +sorority +corresponded +dispersal +kochi +toned +##imi +##lita +internacional +newfound +##agger +##lynn +##rigue +booths +peanuts +##eborg +medicare +muriel +nur +##uram +crates +millennia +pajamas +worsened +##breakers +jimi +vanuatu +yawned +##udeau +carousel +##hony +hurdle +##ccus +##mounted +##pod +rv +##eche +airship +ambiguity +compulsion +recapture +##claiming +arthritis +##osomal +1667 +asserting +ngc +sniffing +dade +discontent +glendale +ported +##amina +defamation +rammed +##scent +fling +livingstone +##fleet +875 +##ppy +apocalyptic +comrade +lcd +##lowe +cessna +eine +persecuted +subsistence +demi +hoop +reliefs +710 +coptic +progressing +stemmed +perpetrators +1665 +priestess +##nio +dobson +ebony +rooster +itf +tortricidae +##bbon +##jian +cleanup +##jean +##øy +1721 +eighties +taxonomic +holiness +##hearted +##spar +antilles +showcasing +stabilized +##nb +gia +mascara +michelangelo +dawned +##uria +##vinsky +extinguished +fitz +grotesque +£100 +##fera +##loid +##mous +barges +neue +throbbed +cipher +johnnie +##a1 +##mpt +outburst +##swick +spearheaded +administrations +c1 +heartbreak +pixels +pleasantly +##enay +lombardy +plush +##nsed +bobbie +##hly +reapers +tremor +xiang +minogue +substantive +hitch +barak +##wyl +kwan +##encia +910 +obscene +elegance +indus +surfer +bribery +conserve +##hyllum +##masters +horatio +##fat +apes +rebound +psychotic +##pour +iteration +##mium +##vani +botanic +horribly +antiques +dispose +paxton +##hli +##wg +timeless +1704 +disregard +engraver +hounds +##bau +##version +looted +uno +facilitates +groans +masjid +rutland +antibody +disqualification +decatur +footballers +quake +slacks +48th +rein +scribe +stabilize +commits +exemplary +tho +##hort +##chison +pantry +traversed +##hiti +disrepair +identifiable +vibrated +baccalaureate +##nnis +csa +interviewing +##iensis +##raße +greaves +wealthiest +343 +classed +jogged +£5 +##58 +##atal +illuminating +knicks +respecting +##uno +scrubbed +##iji +##dles +kruger +moods +growls +raider +silvia +chefs +kam +vr +cree +percival +##terol +gunter +counterattack +defiant +henan +ze +##rasia +##riety +equivalence +submissions +##fra +##thor +bautista +mechanically +##heater +cornice +herbal +templar +##mering +outputs +ruining +ligand +renumbered +extravagant +mika +blockbuster +eta +insurrection +##ilia +darkening +ferocious +pianos +strife +kinship +##aer +melee +##anor +##iste +##may +##oue +decidedly +weep +##jad +##missive +##ppel +354 +puget +unease +##gnant +1629 +hammering +kassel +ob +wessex +##lga +bromwich +egan +paranoia +utilization +##atable +##idad +contradictory +provoke +##ols +##ouring +##tangled +knesset +##very +##lette +plumbing +##sden +##¹ +greensboro +occult +sniff +338 +zev +beaming +gamer +haggard +mahal +##olt +##pins +mendes +utmost +briefing +gunnery +##gut +##pher +##zh +##rok +1679 +khalifa +sonya +##boot +principals +urbana +wiring +##liffe +##minating +##rrado +dahl +nyu +skepticism +np +townspeople +ithaca +lobster +somethin +##fur +##arina +##−1 +freighter +zimmerman +biceps 
+contractual +##herton +amend +hurrying +subconscious +##anal +336 +meng +clermont +spawning +##eia +##lub +dignitaries +impetus +snacks +spotting +twigs +##bilis +##cz +##ouk +libertadores +nic +skylar +##aina +##firm +gustave +asean +##anum +dieter +legislatures +flirt +bromley +trolls +umar +##bbies +##tyle +blah +parc +bridgeport +crank +negligence +##nction +46th +constantin +molded +bandages +seriousness +00pm +siegel +carpets +compartments +upbeat +statehood +##dner +##edging +marko +730 +platt +##hane +paving +##iy +1738 +abbess +impatience +limousine +nbl +##talk +441 +lucille +mojo +nightfall +robbers +##nais +karel +brisk +calves +replicate +ascribed +telescopes +##olf +intimidated +##reen +ballast +specialization +##sit +aerodynamic +caliphate +rainer +visionary +##arded +epsilon +##aday +##onte +aggregation +auditory +boosted +reunification +kathmandu +loco +robyn +402 +acknowledges +appointing +humanoid +newell +redeveloped +restraints +##tained +barbarians +chopper +1609 +italiana +##lez +##lho +investigates +wrestlemania +##anies +##bib +690 +##falls +creaked +dragoons +gravely +minions +stupidity +volley +##harat +##week +musik +##eries +##uously +fungal +massimo +semantics +malvern +##ahl +##pee +discourage +embryo +imperialism +1910s +profoundly +##ddled +jiangsu +sparkled +stat +##holz +sweatshirt +tobin +##iction +sneered +##cheon +##oit +brit +causal +smyth +##neuve +diffuse +perrin +silvio +##ipes +##recht +detonated +iqbal +selma +##nism +##zumi +roasted +##riders +tay +##ados +##mament +##mut +##rud +840 +completes +nipples +cfa +flavour +hirsch +##laus +calderon +sneakers +moravian +##ksha +1622 +rq +294 +##imeters +bodo +##isance +##pre +##ronia +anatomical +excerpt +##lke +dh +kunst +##tablished +##scoe +biomass +panted +unharmed +gael +housemates +montpellier +##59 +coa +rodents +tonic +hickory +singleton +##taro +451 +1719 +aldo +breaststroke +dempsey +och +rocco +##cuit +merton +dissemination +midsummer +serials +##idi +haji +polynomials +##rdon +gs +enoch +prematurely +shutter +taunton +£3 +##grating +##inates +archangel +harassed +##asco +326 +archway +dazzling +##ecin +1736 +sumo +wat +##kovich +1086 +honneur +##ently +##nostic +##ttal +##idon +1605 +403 +1716 +blogger +rents +##gnan +hires +##ikh +##dant +howie +##rons +handler +retracted +shocks +1632 +arun +duluth +kepler +trumpeter +##lary +peeking +seasoned +trooper +##mara +laszlo +##iciencies +##rti +heterosexual +##inatory +##ssion +indira +jogging +##inga +##lism +beit +dissatisfaction +malice +##ately +nedra +peeling +##rgeon +47th +stadiums +475 +vertigo +##ains +iced +restroom +##plify +##tub +illustrating +pear +##chner +##sibility +inorganic +rappers +receipts +watery +##kura +lucinda +##oulos +reintroduced +##8th +##tched +gracefully +saxons +nutritional +wastewater +rained +favourites +bedrock +fisted +hallways +likeness +upscale +##lateral +1580 +blinds +prequel +##pps +##tama +deter +humiliating +restraining +tn +vents +1659 +laundering +recess +rosary +tractors +coulter +federer +##ifiers +##plin +persistence +##quitable +geschichte +pendulum +quakers +##beam +bassett +pictorial +buffet +koln +##sitor +drills +reciprocal +shooters +##57 +##cton +##tees +converge +pip +dmitri +donnelly +yamamoto +aqua +azores +demographics +hypnotic +spitfire +suspend +wryly +roderick +##rran +sebastien +##asurable +mavericks +##fles +##200 +himalayan +prodigy +##iance +transvaal +demonstrators +handcuffs +dodged +mcnamara +sublime +1726 +crazed +##efined +##till +ivo +pondered +reconciled +shrill +sava 
+##duk +bal +cad +heresy +jaipur +goran +##nished +341 +lux +shelly +whitehall +##hre +israelis +peacekeeping +##wled +1703 +demetrius +ousted +##arians +##zos +beale +anwar +backstroke +raged +shrinking +cremated +##yck +benign +towing +wadi +darmstadt +landfill +parana +soothe +colleen +sidewalks +mayfair +tumble +hepatitis +ferrer +superstructure +##gingly +##urse +##wee +anthropological +translators +##mies +closeness +hooves +##pw +mondays +##roll +##vita +landscaping +##urized +purification +sock +thorns +thwarted +jalan +tiberius +##taka +saline +##rito +confidently +khyber +sculptors +##ij +brahms +hammersmith +inspectors +battista +fivb +fragmentation +hackney +##uls +arresting +exercising +antoinette +bedfordshire +##zily +dyed +##hema +1656 +racetrack +variability +##tique +1655 +austrians +deteriorating +madman +theorists +aix +lehman +weathered +1731 +decreed +eruptions +1729 +flaw +quinlan +sorbonne +flutes +nunez +1711 +adored +downwards +fable +rasped +1712 +moritz +mouthful +renegade +shivers +stunts +dysfunction +restrain +translit +327 +pancakes +##avio +##cision +##tray +351 +vial +##lden +bain +##maid +##oxide +chihuahua +malacca +vimes +##rba +##rnier +1664 +donnie +plaques +##ually +337 +bangs +floppy +huntsville +loretta +nikolay +##otte +eater +handgun +ubiquitous +##hett +eras +zodiac +1634 +##omorphic +1820s +##zog +cochran +##bula +##lithic +warring +##rada +dalai +excused +blazers +mcconnell +reeling +bot +este +##abi +geese +hoax +taxon +##bla +guitarists +##icon +condemning +hunts +inversion +moffat +taekwondo +##lvis +1624 +stammered +##rest +##rzy +sousa +fundraiser +marylebone +navigable +uptown +cabbage +daniela +salman +shitty +whimper +##kian +##utive +programmers +protections +rm +##rmi +##rued +forceful +##enes +fuss +##tao +##wash +brat +oppressive +reykjavik +spartak +ticking +##inkles +##kiewicz +adolph +horst +maui +protege +straighten +cpc +landau +concourse +clements +resultant +##ando +imaginative +joo +reactivated +##rem +##ffled +##uising +consultative +##guide +flop +kaitlyn +mergers +parenting +somber +##vron +supervise +vidhan +##imum +courtship +exemplified +harmonies +medallist +refining +##rrow +##ка +amara +##hum +780 +goalscorer +sited +overshadowed +rohan +displeasure +secretive +multiplied +osman +##orth +engravings +padre +##kali +##veda +miniatures +mis +##yala +clap +pali +rook +##cana +1692 +57th +antennae +astro +oskar +1628 +bulldog +crotch +hackett +yucatan +##sure +amplifiers +brno +ferrara +migrating +##gree +thanking +turing +##eza +mccann +ting +andersson +onslaught +gaines +ganga +incense +standardization +##mation +sentai +scuba +stuffing +turquoise +waivers +alloys +##vitt +regaining +vaults +##clops +##gizing +digger +furry +memorabilia +probing +##iad +payton +rec +deutschland +filippo +opaque +seamen +zenith +afrikaans +##filtration +disciplined +inspirational +##merie +banco +confuse +grafton +tod +##dgets +championed +simi +anomaly +biplane +##ceptive +electrode +##para +1697 +cleavage +crossbow +swirl +informant +##lars +##osta +afi +bonfire +spec +##oux +lakeside +slump +##culus +##lais +##qvist +##rrigan +1016 +facades +borg +inwardly +cervical +xl +pointedly +050 +stabilization +##odon +chests +1699 +hacked +ctv +orthogonal +suzy +##lastic +gaulle +jacobite +rearview +##cam +##erted +ashby +##drik +##igate +##mise +##zbek +affectionately +canine +disperse +latham +##istles +##ivar +spielberg +##orin +##idium +ezekiel +cid +##sg +durga +middletown +##cina +customized +frontiers +harden +##etano +##zzy +1604 
+bolsheviks +##66 +coloration +yoko +##bedo +briefs +slabs +debra +liquidation +plumage +##oin +blossoms +dementia +subsidy +1611 +proctor +relational +jerseys +parochial +ter +##ici +esa +peshawar +cavalier +loren +cpi +idiots +shamrock +1646 +dutton +malabar +mustache +##endez +##ocytes +referencing +terminates +marche +yarmouth +##sop +acton +mated +seton +subtly +baptised +beige +extremes +jolted +kristina +telecast +##actic +safeguard +waldo +##baldi +##bular +endeavors +sloppy +subterranean +##ensburg +##itung +delicately +pigment +tq +##scu +1626 +##ound +collisions +coveted +herds +##personal +##meister +##nberger +chopra +##ricting +abnormalities +defective +galician +lucie +##dilly +alligator +likened +##genase +burundi +clears +complexion +derelict +deafening +diablo +fingered +champaign +dogg +enlist +isotope +labeling +mrna +##erre +brilliance +marvelous +##ayo +1652 +crawley +ether +footed +dwellers +deserts +hamish +rubs +warlock +skimmed +##lizer +870 +buick +embark +heraldic +irregularities +##ajan +kiara +##kulam +##ieg +antigen +kowalski +##lge +oakley +visitation +##mbit +vt +##suit +1570 +murderers +##miento +##rites +chimneys +##sling +condemn +custer +exchequer +havre +##ghi +fluctuations +##rations +dfb +hendricks +vaccines +##tarian +nietzsche +biking +juicy +##duced +brooding +scrolling +selangor +##ragan +352 +annum +boomed +seminole +sugarcane +##dna +departmental +dismissing +innsbruck +arteries +ashok +batavia +daze +kun +overtook +##rga +##tlan +beheaded +gaddafi +holm +electronically +faulty +galilee +fractures +kobayashi +##lized +gunmen +magma +aramaic +mala +eastenders +inference +messengers +bf +##qu +407 +bathrooms +##vere +1658 +flashbacks +ideally +misunderstood +##jali +##weather +mendez +##grounds +505 +uncanny +##iii +1709 +friendships +##nbc +sacrament +accommodated +reiterated +logistical +pebbles +thumped +##escence +administering +decrees +drafts +##flight +##cased +##tula +futuristic +picket +intimidation +winthrop +##fahan +interfered +339 +afar +francoise +morally +uta +cochin +croft +dwarfs +##bruck +##dents +##nami +biker +##hner +##meral +nano +##isen +##ometric +##pres +##ан +brightened +meek +parcels +securely +gunners +##jhl +##zko +agile +hysteria +##lten +##rcus +bukit +champs +chevy +cuckoo +leith +sadler +theologians +welded +##section +1663 +jj +plurality +xander +##rooms +##formed +shredded +temps +intimately +pau +tormented +##lok +##stellar +1618 +charred +ems +essen +##mmel +alarms +spraying +ascot +blooms +twinkle +##abia +##apes +internment +obsidian +##chaft +snoop +##dav +##ooping +malibu +##tension +quiver +##itia +hays +mcintosh +travers +walsall +##ffie +1623 +beverley +schwarz +plunging +structurally +m3 +rosenthal +vikram +##tsk +770 +ghz +##onda +##tiv +chalmers +groningen +pew +reckon +unicef +##rvis +55th +##gni +1651 +sulawesi +avila +cai +metaphysical +screwing +turbulence +##mberg +augusto +samba +56th +baffled +momentary +toxin +##urian +##wani +aachen +condoms +dali +steppe +##3d +##app +##oed +##year +adolescence +dauphin +electrically +inaccessible +microscopy +nikita +##ega +atv +##cel +##enter +##oles +##oteric +##ы +accountants +punishments +wrongly +bribes +adventurous +clinch +flinders +southland +##hem +##kata +gough +##ciency +lads +soared +##ה +undergoes +deformation +outlawed +rubbish +##arus +##mussen +##nidae +##rzburg +arcs +##ingdon +##tituted +1695 +wheelbase +wheeling +bombardier +campground +zebra +##lices +##oj +##bain +lullaby +##ecure +donetsk +wylie +grenada +##arding +##ης +squinting 
+eireann +opposes +##andra +maximal +runes +##broken +##cuting +##iface +##ror +##rosis +additive +britney +adultery +triggering +##drome +detrimental +aarhus +containment +jc +swapped +vichy +##ioms +madly +##oric +##rag +brant +##ckey +##trix +1560 +1612 +broughton +rustling +##stems +##uder +asbestos +mentoring +##nivorous +finley +leaps +##isan +apical +pry +slits +substitutes +##dict +intuitive +fantasia +insistent +unreasonable +##igen +##vna +domed +hannover +margot +ponder +##zziness +impromptu +jian +lc +rampage +stemming +##eft +andrey +gerais +whichever +amnesia +appropriated +anzac +clicks +modifying +ultimatum +cambrian +maids +verve +yellowstone +##mbs +conservatoire +##scribe +adherence +dinners +spectra +imperfect +mysteriously +sidekick +tatar +tuba +##aks +##ifolia +distrust +##athan +##zle +c2 +ronin +zac +##pse +celaena +instrumentalist +scents +skopje +##mbling +comical +compensated +vidal +condor +intersect +jingle +wavelengths +##urrent +mcqueen +##izzly +carp +weasel +422 +kanye +militias +postdoctoral +eugen +gunslinger +##ɛ +faux +hospice +##for +appalled +derivation +dwarves +##elis +dilapidated +##folk +astoria +philology +##lwyn +##otho +##saka +inducing +philanthropy +##bf +##itative +geek +markedly +sql +##yce +bessie +indices +rn +##flict +495 +frowns +resolving +weightlifting +tugs +cleric +contentious +1653 +mania +rms +##miya +##reate +##ruck +##tucket +bien +eels +marek +##ayton +##cence +discreet +unofficially +##ife +leaks +##bber +1705 +332 +dung +compressor +hillsborough +pandit +shillings +distal +##skin +381 +##tat +##you +nosed +##nir +mangrove +undeveloped +##idia +textures +##inho +##500 +##rise +ae +irritating +nay +amazingly +bancroft +apologetic +compassionate +kata +symphonies +##lovic +airspace +##lch +930 +gifford +precautions +fulfillment +sevilla +vulgar +martinique +##urities +looting +piccolo +tidy +##dermott +quadrant +armchair +incomes +mathematicians +stampede +nilsson +##inking +##scan +foo +quarterfinal +##ostal +shang +shouldered +squirrels +##owe +344 +vinegar +##bner +##rchy +##systems +delaying +##trics +ars +dwyer +rhapsody +sponsoring +##gration +bipolar +cinder +starters +##olio +##urst +421 +signage +##nty +aground +figurative +mons +acquaintances +duets +erroneously +soyuz +elliptic +recreated +##cultural +##quette +##ssed +##tma +##zcz +moderator +scares +##itaire +##stones +##udence +juniper +sighting +##just +##nsen +britten +calabria +ry +bop +cramer +forsyth +stillness +##л +airmen +gathers +unfit +##umber +##upt +taunting +##rip +seeker +streamlined +##bution +holster +schumann +tread +vox +##gano +##onzo +strive +dil +reforming +covent +newbury +predicting +##orro +decorate +tre +##puted +andover +ie +asahi +dept +dunkirk +gills +##tori +buren +huskies +##stis +##stov +abstracts +bets +loosen +##opa +1682 +yearning +##glio +##sir +berman +effortlessly +enamel +napoli +persist +##peration +##uez +attache +elisa +b1 +invitations +##kic +accelerating +reindeer +boardwalk +clutches +nelly +polka +starbucks +##kei +adamant +huey +lough +unbroken +adventurer +embroidery +inspecting +stanza +##ducted +naia +taluka +##pone +##roids +chases +deprivation +florian +##jing +##ppet +earthly +##lib +##ssee +colossal +foreigner +vet +freaks +patrice +rosewood +triassic +upstate +##pkins +dominates +ata +chants +ks +vo +##400 +##bley +##raya +##rmed +555 +agra +infiltrate +##ailing +##ilation +##tzer +##uppe +##werk +binoculars +enthusiast +fujian +squeak +##avs +abolitionist +almeida +boredom +hampstead +marsden +rations +##ands 
+inflated +334 +bonuses +rosalie +patna +##rco +329 +detachments +penitentiary +54th +flourishing +woolf +##dion +##etched +papyrus +##lster +##nsor +##toy +bobbed +dismounted +endelle +inhuman +motorola +tbs +wince +wreath +##ticus +hideout +inspections +sanjay +disgrace +infused +pudding +stalks +##urbed +arsenic +leases +##hyl +##rrard +collarbone +##waite +##wil +dowry +##bant +##edance +genealogical +nitrate +salamanca +scandals +thyroid +necessitated +##! +##" +### +##$ +##% +##& +##' +##( +##) +##* +##+ +##, +##- +##. +##/ +##: +##; +##< +##= +##> +##? +##@ +##[ +##\ +##] +##^ +##_ +##` +##{ +##| +##} +##~ +##¡ +##¢ +##£ +##¤ +##¥ +##¦ +##§ +##¨ +##© +##ª +##« +##¬ +##® +##± +##´ +##µ +##¶ +##· +##º +##» +##¼ +##¾ +##¿ +##æ +##ð +##÷ +##þ +##đ +##ħ +##ŋ +##œ +##ƒ +##ɐ +##ɑ +##ɒ +##ɔ +##ɕ +##ə +##ɡ +##ɣ +##ɨ +##ɪ +##ɫ +##ɬ +##ɯ +##ɲ +##ɴ +##ɹ +##ɾ +##ʀ +##ʁ +##ʂ +##ʃ +##ʉ +##ʊ +##ʋ +##ʌ +##ʎ +##ʐ +##ʑ +##ʒ +##ʔ +##ʰ +##ʲ +##ʳ +##ʷ +##ʸ +##ʻ +##ʼ +##ʾ +##ʿ +##ˈ +##ˡ +##ˢ +##ˣ +##ˤ +##β +##γ +##δ +##ε +##ζ +##θ +##κ +##λ +##μ +##ξ +##ο +##π +##ρ +##σ +##τ +##υ +##φ +##χ +##ψ +##ω +##б +##г +##д +##ж +##з +##м +##п +##с +##у +##ф +##х +##ц +##ч +##ш +##щ +##ъ +##э +##ю +##ђ +##є +##і +##ј +##љ +##њ +##ћ +##ӏ +##ա +##բ +##գ +##դ +##ե +##թ +##ի +##լ +##կ +##հ +##մ +##յ +##ն +##ո +##պ +##ս +##վ +##տ +##ր +##ւ +##ք +##־ +##א +##ב +##ג +##ד +##ו +##ז +##ח +##ט +##י +##ך +##כ +##ל +##ם +##מ +##ן +##נ +##ס +##ע +##ף +##פ +##ץ +##צ +##ק +##ר +##ש +##ת +##، +##ء +##ب +##ت +##ث +##ج +##ح +##خ +##ذ +##ز +##س +##ش +##ص +##ض +##ط +##ظ +##ع +##غ +##ـ +##ف +##ق +##ك +##و +##ى +##ٹ +##پ +##چ +##ک +##گ +##ں +##ھ +##ہ +##ے +##अ +##आ +##उ +##ए +##क +##ख +##ग +##च +##ज +##ट +##ड +##ण +##त +##थ +##द +##ध +##न +##प +##ब +##भ +##म +##य +##र +##ल +##व +##श +##ष +##स +##ह +##ा +##ि +##ी +##ो +##। +##॥ +##ং +##অ +##আ +##ই +##উ +##এ +##ও +##ক +##খ +##গ +##চ +##ছ +##জ +##ট +##ড +##ণ +##ত +##থ +##দ +##ধ +##ন +##প +##ব +##ভ +##ম +##য +##র +##ল +##শ +##ষ +##স +##হ +##া +##ি +##ী +##ে +##க +##ச +##ட +##த +##ந +##ன +##ப +##ம +##ய +##ர +##ல +##ள +##வ +##ா +##ி +##ு +##ே +##ை +##ನ +##ರ +##ಾ +##ක +##ය +##ර +##ල +##ව +##ා +##ก +##ง +##ต +##ท +##น +##พ +##ม +##ย +##ร +##ล +##ว +##ส +##อ +##า +##เ +##་ +##། +##ག +##ང +##ད +##ན +##པ +##བ +##མ +##འ +##ར +##ལ +##ས +##မ +##ა +##ბ +##გ +##დ +##ე +##ვ +##თ +##ი +##კ +##ლ +##მ +##ნ +##ო +##რ +##ს +##ტ +##უ +##ᄀ +##ᄂ +##ᄃ +##ᄅ +##ᄆ +##ᄇ +##ᄉ +##ᄊ +##ᄋ +##ᄌ +##ᄎ +##ᄏ +##ᄐ +##ᄑ +##ᄒ +##ᅡ +##ᅢ +##ᅥ +##ᅦ +##ᅧ +##ᅩ +##ᅪ +##ᅭ +##ᅮ +##ᅯ +##ᅲ +##ᅳ +##ᅴ +##ᅵ +##ᆨ +##ᆫ +##ᆯ +##ᆷ +##ᆸ +##ᆼ +##ᴬ +##ᴮ +##ᴰ +##ᴵ +##ᴺ +##ᵀ +##ᵃ +##ᵇ +##ᵈ +##ᵉ +##ᵍ +##ᵏ +##ᵐ +##ᵒ +##ᵖ +##ᵗ +##ᵘ +##ᵣ +##ᵤ +##ᵥ +##ᶜ +##ᶠ +##‐ +##‑ +##‒ +##– +##— +##― +##‖ +##‘ +##’ +##‚ +##“ +##” +##„ +##† +##‡ +##• +##… +##‰ +##′ +##″ +##› +##‿ +##⁄ +##⁰ +##ⁱ +##⁴ +##⁵ +##⁶ +##⁷ +##⁸ +##⁹ +##⁻ +##ⁿ +##₅ +##₆ +##₇ +##₈ +##₉ +##₊ +##₍ +##₎ +##ₐ +##ₑ +##ₒ +##ₓ +##ₕ +##ₖ +##ₗ +##ₘ +##ₚ +##ₛ +##ₜ +##₤ +##₩ +##€ +##₱ +##₹ +##ℓ +##№ +##ℝ +##™ +##⅓ +##⅔ +##← +##↑ +##→ +##↓ +##↔ +##↦ +##⇄ +##⇌ +##⇒ +##∂ +##∅ +##∆ +##∇ +##∈ +##∗ +##∘ +##√ +##∞ +##∧ +##∨ +##∩ +##∪ +##≈ +##≡ +##≤ +##≥ +##⊂ +##⊆ +##⊕ +##⊗ +##⋅ +##─ +##│ +##■ +##▪ +##● +##★ +##☆ +##☉ +##♠ +##♣ +##♥ +##♦ +##♯ +##⟨ +##⟩ +##ⱼ +##⺩ +##⺼ +##⽥ +##、 +##。 +##〈 +##〉 +##《 +##》 +##「 +##」 +##『 +##』 +##〜 +##あ +##い +##う +##え +##お +##か +##き +##く +##け +##こ +##さ +##し +##す +##せ +##そ +##た +##ち +##っ +##つ +##て +##と +##な +##に +##ぬ +##ね +##の +##は +##ひ +##ふ +##へ +##ほ +##ま +##み +##む +##め +##も +##や +##ゆ +##よ +##ら +##り +##る +##れ +##ろ +##を +##ん +##ァ +##ア +##ィ +##イ +##ウ +##ェ +##エ +##オ +##カ +##キ +##ク +##ケ 
+##コ +##サ +##シ +##ス +##セ +##タ +##チ +##ッ +##ツ +##テ +##ト +##ナ +##ニ +##ノ +##ハ +##ヒ +##フ +##ヘ +##ホ +##マ +##ミ +##ム +##メ +##モ +##ャ +##ュ +##ョ +##ラ +##リ +##ル +##レ +##ロ +##ワ +##ン +##・ +##ー +##一 +##三 +##上 +##下 +##不 +##世 +##中 +##主 +##久 +##之 +##也 +##事 +##二 +##五 +##井 +##京 +##人 +##亻 +##仁 +##介 +##代 +##仮 +##伊 +##会 +##佐 +##侍 +##保 +##信 +##健 +##元 +##光 +##八 +##公 +##内 +##出 +##分 +##前 +##劉 +##力 +##加 +##勝 +##北 +##区 +##十 +##千 +##南 +##博 +##原 +##口 +##古 +##史 +##司 +##合 +##吉 +##同 +##名 +##和 +##囗 +##四 +##国 +##國 +##土 +##地 +##坂 +##城 +##堂 +##場 +##士 +##夏 +##外 +##大 +##天 +##太 +##夫 +##奈 +##女 +##子 +##学 +##宀 +##宇 +##安 +##宗 +##定 +##宣 +##宮 +##家 +##宿 +##寺 +##將 +##小 +##尚 +##山 +##岡 +##島 +##崎 +##川 +##州 +##巿 +##帝 +##平 +##年 +##幸 +##广 +##弘 +##張 +##彳 +##後 +##御 +##德 +##心 +##忄 +##志 +##忠 +##愛 +##成 +##我 +##戦 +##戸 +##手 +##扌 +##政 +##文 +##新 +##方 +##日 +##明 +##星 +##春 +##昭 +##智 +##曲 +##書 +##月 +##有 +##朝 +##木 +##本 +##李 +##村 +##東 +##松 +##林 +##森 +##楊 +##樹 +##橋 +##歌 +##止 +##正 +##武 +##比 +##氏 +##民 +##水 +##氵 +##氷 +##永 +##江 +##沢 +##河 +##治 +##法 +##海 +##清 +##漢 +##瀬 +##火 +##版 +##犬 +##王 +##生 +##田 +##男 +##疒 +##発 +##白 +##的 +##皇 +##目 +##相 +##省 +##真 +##石 +##示 +##社 +##神 +##福 +##禾 +##秀 +##秋 +##空 +##立 +##章 +##竹 +##糹 +##美 +##義 +##耳 +##良 +##艹 +##花 +##英 +##華 +##葉 +##藤 +##行 +##街 +##西 +##見 +##訁 +##語 +##谷 +##貝 +##貴 +##車 +##軍 +##辶 +##道 +##郎 +##郡 +##部 +##都 +##里 +##野 +##金 +##鈴 +##镇 +##長 +##門 +##間 +##阝 +##阿 +##陳 +##陽 +##雄 +##青 +##面 +##風 +##食 +##香 +##馬 +##高 +##龍 +##龸 +##fi +##fl +##! +##( +##) +##, +##- +##. +##/ +##: +##? +##~ diff --git a/modelzoo/LanguageModeling/BERT/.dockerignore b/modelzoo/LanguageModeling/BERT/.dockerignore new file mode 100644 index 00000000..c70b78d6 --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/.dockerignore @@ -0,0 +1,27 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +data_dl/ +.idea/ +.git/ +.vscode/ +__pycache__/ +results/ +data/binary +data/download +data/extracted +data/formatted_one_article_per_line +data/sharded +data/hdf5* +data/tfrecord* +checkpoints/ diff --git a/modelzoo/LanguageModeling/BERT/.gitignore b/modelzoo/LanguageModeling/BERT/.gitignore new file mode 100644 index 00000000..61c2b075 --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/.gitignore @@ -0,0 +1,147 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +# Initially taken from Github's Python gitignore file + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +#Data +data/download +data/extracted +data/formatted_one_article_per_line +data/sharded +data/hdf5* +data/tfrecord* +data/*/*.zip + +#Resutls +results/ + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +.vscode/ +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# TensorRT +*.engine +models/ diff --git a/modelzoo/LanguageModeling/BERT/Bert_result.md b/modelzoo/LanguageModeling/BERT/Bert_result.md deleted file mode 100644 index 722e9f80..00000000 --- a/modelzoo/LanguageModeling/BERT/Bert_result.md +++ /dev/null @@ -1,26 +0,0 @@ -| Parameters | A100 | A800 | -| --------------------- | ----------------------- | ----------------------- | -| DataSet | SQuAD1.1 | SQuAD1.1 | -| num_hidden_layers | 12 | 12 | -| batch_size_per_gpu | 32 | 32 | -| learning_rate_per_gpu | 5e-6 | 5e-6 | -| precision | fp16 | Fp16 | -| use_xla | true | true | -| num_gpus | 8 | 8 | -| max_seq_length | 384 | 384 | -| doc_stride | 128 | 128 | -| epochs | 1 | 1 | -| checkpoint | uncased_L-12_H-768_A-12 | uncased_L-12_H-768_A-12 | - -| Task | total_training_steps | train_loss | F1 | exact_match | Throughput Average (sentences/sec)! | Training Duration sec | GPU Util | GPU Memory-Usage(MB)! | -| -------------- | ---------- | ----------------------------------- | --------------------- | -------- | --------------------- | --------------------- | --------------------- | --------------------- | -| A800_GPU-8_bs-12_LR-5e-6_fp16_XLA-true_BERT-base_SQuAD1.1_Epoch-1 | | | 85.2391 | 76.6982 | 280.94 | 1011.48 | 8 * 96% | | -| | | | | | | | | | -| | | | | | | | | | -| | | | | | | | | | -| | | | | | | | | | -| TITAN_GPU-4_bs-12_LR-5e-6_fp16_XLA-true_BERT-base_SQuAD1.1_Epoch-1 | 1846 | 1.0677543878555298 | 84.4242 | 75.4494 | 286.32 | 511.30 for Examples = 88608 | 8 * 96% | | - -*Memory(GiB): Max consumption - -*CPU Util: Max moment value diff --git a/modelzoo/LanguageModeling/BERT/Dockerfile b/modelzoo/LanguageModeling/BERT/Dockerfile new file mode 100644 index 00000000..cc37ea6b --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/Dockerfile @@ -0,0 +1,55 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. 
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +ARG FROM_IMAGE_NAME=nvcr.io/nvidia/tensorflow:21.02-tf2-py3 +FROM ${FROM_IMAGE_NAME} + +RUN apt-get update && apt-get install -y pbzip2 pv bzip2 libcurl4 curl + +WORKDIR /workspace +ENV HOME /workspace + +WORKDIR /workspace +RUN git clone https://github.com/openai/gradient-checkpointing.git +RUN git clone https://github.com/attardi/wikiextractor.git && cd wikiextractor && git checkout 6408a430fc504a38b04d37ce5e7fc740191dee16 && cd .. +RUN git clone https://github.com/soskek/bookcorpus.git +RUN git clone https://github.com/titipata/pubmed_parser + +RUN pip3 install /workspace/pubmed_parser + +# Environment +ENV LANG C.UTF-8 +ENV LC_ALL C.UTF-8 + +# Install Python 3 packages +RUN pip3 install \ + requests \ + tqdm \ + horovod \ + sentencepiece \ + tensorflow_hub \ + pynvml \ + wget \ + progressbar \ + git+https://github.com/NVIDIA/dllogger + +WORKDIR /workspace/bert_tf2 +# Copy model into image - This can be overridden by mounting a volume to the same location. +COPY . . +ENV PYTHONPATH="/workspace/wikiextractor:/workspace/bert_tf2:${PYTHONPATH}" + +#disable lazy compilatoin +ENV TF_XLA_FLAGS="--tf_xla_enable_lazy_compilation=false" + +ENV TF_DEVICE_MIN_SYS_MEMORY_IN_MB=2048 diff --git a/modelzoo/LanguageModeling/BERT/README.md b/modelzoo/LanguageModeling/BERT/README.md index 6684b4d5..f5e58033 100644 --- a/modelzoo/LanguageModeling/BERT/README.md +++ b/modelzoo/LanguageModeling/BERT/README.md @@ -150,7 +150,7 @@ For information about: #### Enabling mixed precision -This implementation exploits the TensorFlow Automatic Mixed Precision feature. To enable AMP, you simply need to supply the `--dtype=fp16` flag to the `run_pretraining.py` or `run_squad.py` script. For reference, enabling AMP required us to apply the following changes to the code: +This implementation exploits the TensorFlow Automatic Mixed Precision feature. To enable AMP, you simply need to supply the `--use_fp16` flag to the `run_pretraining.py` or `run_squad.py` script. For reference, enabling AMP required us to apply the following changes to the code: 1. Set the Keras mixed precision policy: ```python @@ -393,7 +393,7 @@ The `official/` folder contains necessary files of building model architecture a Aside from the options to set hyperparameters, the relevant options to control the behaviour of the `run_pretraining.py` script are: ``` - --config_file: Bert configuration file to define core bert layers. + --bert_config_file: Bert configuration file to define core bert layers. --init_checkpoint: Initial checkpoint (usually from a pre-trained BERT model). --[no]use_horovod: Whether to use horovod.(default: 'false') --[no]use_fp16: Whether to use fp32 or fp16 arithmetic on GPU. 
When false, uses TF32 on A100 and FP32 on V100 GPUS.(default: 'false') @@ -407,7 +407,7 @@ Aside from the options to set hyperparameters, the relevant options to control t Aside from the options to set hyperparameters, some relevant options to control the behaviour of the `run_squad.py` script are: ``` - --config_file: Bert configuration file to define core bert layers. + --bert_config_file: Bert configuration file to define core bert layers. --model_dir: The location of the model checkpoint files. --mode: : One of {"train_and_predict", "train", "predict", "export_only"}. `train_and_predict`: both train and predict to a json file. `train`: only trains the model. trains the model and evaluates in the meantime. `predict`: predict answers from the squad json file. `export_only`: will take the latest checkpoint inside model_dir and export a `SavedModel`. --max_answer_length: The maximum length of an answer that can be generated. (default: '30')(an integer) @@ -569,7 +569,7 @@ mpirun -np 8 \ -x LD_LIBRARY_PATH \ -x PATH -mca pml ob1 -mca btl ^openib \ python run_squad.py --use_horovod --vocab_file=$BERT_DIR/vocab.txt \ - --config_file=$BERT_DIR/bert_config.json \ + --bert_config_file=$BERT_DIR/bert_config.json \ --model_dir=/results ``` diff --git a/modelzoo/LanguageModeling/BERT/bert_dllogger.json b/modelzoo/LanguageModeling/BERT/bert_dllogger.json new file mode 100644 index 00000000..13d81e4a --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/bert_dllogger.json @@ -0,0 +1,15 @@ +DLLL {"timestamp": "1750063068.327787", "elapsedtime": "7e-06", "datetime": "2025-06-16 08:37:48.327787+00:00", "type": "METADATA", "metric": "mlm_loss", "metadata": {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}} +DLLL {"timestamp": "1750063068.327887", "elapsedtime": "0.000107", "datetime": "2025-06-16 08:37:48.327887+00:00", "type": "METADATA", "metric": "nsp_loss", "metadata": {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}} +DLLL {"timestamp": "1750063068.327918", "elapsedtime": "0.000138", "datetime": "2025-06-16 08:37:48.327918+00:00", "type": "METADATA", "metric": "avg_loss_step", "metadata": {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}} +DLLL {"timestamp": "1750063068.327944", "elapsedtime": "0.000164", "datetime": "2025-06-16 08:37:48.327944+00:00", "type": "METADATA", "metric": "total_loss", "metadata": {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}} +DLLL {"timestamp": "1750063068.327964", "elapsedtime": "0.000184", "datetime": "2025-06-16 08:37:48.327964+00:00", "type": "METADATA", "metric": "loss", "metadata": {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "TRAIN"}} +DLLL {"timestamp": "1750063068.327982", "elapsedtime": "0.000202", "datetime": "2025-06-16 08:37:48.327982+00:00", "type": "METADATA", "metric": "f1", "metadata": {"unit": null, "format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}} +DLLL {"timestamp": "1750063068.328002", "elapsedtime": "0.000222", "datetime": "2025-06-16 08:37:48.328002+00:00", "type": "METADATA", "metric": "precision", "metadata": {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}} +DLLL {"timestamp": "1750063068.32802", "elapsedtime": "0.00024", "datetime": "2025-06-16 08:37:48.328020+00:00", "type": "METADATA", "metric": "recall", "metadata": {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}} +DLLL {"timestamp": "1750063068.328037", "elapsedtime": "0.000257", "datetime": "2025-06-16 08:37:48.328037+00:00", "type": "METADATA", "metric": "mcc", "metadata": {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}} +DLLL 
{"timestamp": "1750063068.328056", "elapsedtime": "0.000276", "datetime": "2025-06-16 08:37:48.328056+00:00", "type": "METADATA", "metric": "exact_match", "metadata": {"format": ":.4f", "GOAL": "MINIMIZE", "STAGE": "VAL"}} +DLLL {"timestamp": "1750063068.328073", "elapsedtime": "0.000293", "datetime": "2025-06-16 08:37:48.328073+00:00", "type": "METADATA", "metric": "throughput_train", "metadata": {"unit": "sequences/s", "format": ":.3f", "GOAL": "MAXIMIZE", "STAGE": "TRAIN"}} +DLLL {"timestamp": "1750063068.328092", "elapsedtime": "0.000312", "datetime": "2025-06-16 08:37:48.328092+00:00", "type": "METADATA", "metric": "throughput_inf", "metadata": {"unit": "sequences/s", "format": ":.3f", "GOAL": "MAXIMIZE", "STAGE": "VAL"}} +DLLL {"timestamp": "1750063068.328108", "elapsedtime": "0.000328", "datetime": "2025-06-16 08:37:48.328108+00:00", "type": "METADATA", "metric": "throughput_val", "metadata": {"unit": "sequences/s", "format": ":.3f", "GOAL": "MAXIMIZE", "STAGE": "VAL"}} +DLLL {"timestamp": "1750063224.077653", "datetime": "2025-06-16 08:40:24.077653+00:00", "elapsedtime": "155.749873", "type": "LOG", "step": [], "data": {"throughput_train": 54.50596785734736}} +DLLL {"timestamp": "1750063224.077839", "datetime": "2025-06-16 08:40:24.077839+00:00", "elapsedtime": "155.750059", "type": "LOG", "step": [], "data": {"total_loss": 1.4864305257797241}} diff --git a/modelzoo/LanguageModeling/BERT/classifier_data_lib.py b/modelzoo/LanguageModeling/BERT/classifier_data_lib.py new file mode 100644 index 00000000..891d4991 --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/classifier_data_lib.py @@ -0,0 +1,581 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""BERT library to process data for classification task.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import csv +import os + +from absl import logging +import tensorflow as tf + +import tokenization + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. 
+ """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, + input_ids, + input_mask, + segment_ids, + label_id, + is_real_example=True): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + self.is_real_example = is_real_example + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def __init__(self, process_text_fn=tokenization.convert_to_unicode): + self.process_text_fn = process_text_fn + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_test_examples(self, data_dir): + """Gets a collection of `InputExample`s for prediction.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @staticmethod + def get_processor_name(): + """Gets the string identifier of the processor.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with tf.io.gfile.GFile(input_file, "r") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + lines.append(line) + return lines + + +class XnliProcessor(DataProcessor): + """Processor for the XNLI data set.""" + + def __init__(self, process_text_fn=tokenization.convert_to_unicode): + super(XnliProcessor, self).__init__(process_text_fn) + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + lines = self._read_tsv( + os.path.join(data_dir, "multinli", + "multinli.train.%s.tsv" % self.language)) + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "train-%d" % (i) + text_a = self.process_text_fn(line[0]) + text_b = self.process_text_fn(line[1]) + label = self.process_text_fn(line[2]) + if label == self.process_text_fn("contradictory"): + label = self.process_text_fn("contradiction") + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_dev_examples(self, data_dir): + """See base class.""" + lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv")) + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "dev-%d" % (i) + language = self.process_text_fn(line[0]) + if language != self.process_text_fn(self.language): + continue + text_a = self.process_text_fn(line[6]) + text_b = self.process_text_fn(line[7]) + label = self.process_text_fn(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "XNLI" + + +class MnliProcessor(DataProcessor): + """Processor for the MultiNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + 
self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), + "dev_matched") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "MNLI" + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, self.process_text_fn(line[0])) + text_a = self.process_text_fn(line[8]) + text_b = self.process_text_fn(line[9]) + if set_type == "test": + label = "contradiction" + else: + label = self.process_text_fn(line[-1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MrpcProcessor(DataProcessor): + """Processor for the MRPC data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "MRPC" + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = self.process_text_fn(line[3]) + text_b = self.process_text_fn(line[4]) + if set_type == "test": + label = "0" + else: + label = self.process_text_fn(line[0]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class ColaProcessor(DataProcessor): + """Processor for the CoLA data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "COLA" + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + # Only the test set has a header + if set_type == "test" and i == 0: + continue + guid = "%s-%s" % (set_type, i) + if set_type == "test": + text_a = self.process_text_fn(line[1]) + label = "0" + else: + text_a = self.process_text_fn(line[3]) + label = self.process_text_fn(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class SstProcessor(DataProcessor): + """Processor for the SST-2 data set 
(GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "SST-2" + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + if set_type == "test": + text_a = tokenization.convert_to_unicode(line[1]) + label = "0" + else: + text_a = tokenization.convert_to_unicode(line[0]) + label = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class QnliProcessor(DataProcessor): + """Processor for the QNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev_matched") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "QNLI" + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, 1) + if set_type == "test": + text_a = tokenization.convert_to_unicode(line[1]) + text_b = tokenization.convert_to_unicode(line[2]) + label = "entailment" + else: + text_a = tokenization.convert_to_unicode(line[1]) + text_b = tokenization.convert_to_unicode(line[2]) + label = tokenization.convert_to_unicode(line[-1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +def convert_single_example(ex_index, example, label_list, max_seq_length, + tokenizer): + """Converts a single `InputExample` into a single `InputFeatures`.""" + label_map = {} + for (i, label) in enumerate(label_list): + label_map[label] = i + + tokens_a = tokenizer.tokenize(example.text_a) + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[0:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . 
[SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + label_id = label_map[example.label] + if ex_index < 5: + logging.info("*** Example ***") + logging.info("guid: %s", (example.guid)) + logging.info("tokens: %s", + " ".join([tokenization.printable_text(x) for x in tokens])) + logging.info("input_ids: %s", " ".join([str(x) for x in input_ids])) + logging.info("input_mask: %s", " ".join([str(x) for x in input_mask])) + logging.info("segment_ids: %s", " ".join([str(x) for x in segment_ids])) + logging.info("label: %s (id = %d)", example.label, label_id) + + feature = InputFeatures( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + is_real_example=True) + return feature + + +def file_based_convert_examples_to_features(examples, label_list, + max_seq_length, tokenizer, + output_file): + """Convert a set of `InputExample`s to a TFRecord file.""" + + writer = tf.io.TFRecordWriter(output_file) + + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + logging.info("Writing example %d of %d", ex_index, len(examples)) + + feature = convert_single_example(ex_index, example, label_list, + max_seq_length, tokenizer) + + def create_int_feature(values): + f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return f + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + features["label_ids"] = create_int_feature([feature.label_id]) + features["is_real_example"] = create_int_feature( + [int(feature.is_real_example)]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + writer.write(tf_example.SerializeToString()) + writer.close() + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple 
heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def generate_tf_record_from_data_file(processor, + data_dir, + tokenizer, + train_data_output_path=None, + eval_data_output_path=None, + max_seq_length=128): + """Generates and saves training data into a tf record file. + + Arguments: + processor: Input processor object to be used for generating data. Subclass + of `DataProcessor`. + data_dir: Directory that contains train/eval data to process. Data files + should be in from "dev.tsv", "test.tsv", or "train.tsv". + tokenizer: The tokenizer to be applied on the data. + train_data_output_path: Output to which processed tf record for training + will be saved. + eval_data_output_path: Output to which processed tf record for evaluation + will be saved. + max_seq_length: Maximum sequence length of the to be generated + training/eval data. + + Returns: + A dictionary containing input meta data. + """ + assert train_data_output_path or eval_data_output_path + + label_list = processor.get_labels() + assert train_data_output_path + train_input_data_examples = processor.get_train_examples(data_dir) + file_based_convert_examples_to_features(train_input_data_examples, label_list, + max_seq_length, tokenizer, + train_data_output_path) + num_training_data = len(train_input_data_examples) + + if eval_data_output_path: + eval_input_data_examples = processor.get_dev_examples(data_dir) + file_based_convert_examples_to_features(eval_input_data_examples, + label_list, max_seq_length, + tokenizer, eval_data_output_path) + + meta_data = { + "task_type": "bert_classification", + "processor_type": processor.get_processor_name(), + "num_labels": len(processor.get_labels()), + "train_data_size": num_training_data, + "max_seq_length": max_seq_length, + } + + if eval_data_output_path: + meta_data["eval_data_size"] = len(eval_input_data_examples) + + return meta_data diff --git a/modelzoo/LanguageModeling/BERT/common_flags.py b/modelzoo/LanguageModeling/BERT/common_flags.py new file mode 100644 index 00000000..9728288f --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/common_flags.py @@ -0,0 +1,72 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Defining common flags used across all BERT models/applications.""" + +from absl import flags +import tensorflow as tf + +from deepray.utils.flags import core as flags_core + + +def define_common_bert_flags(): + """Define common flags for BERT tasks.""" + flags.DEFINE_string('bert_config_file', None, + 'Bert configuration file to define core bert layers.') + flags.DEFINE_string( + 'model_export_path', None, + 'Path to the directory, where trainined model will be ' + 'exported.') + flags.DEFINE_string('tpu', '', 'TPU address to connect to.') + flags.DEFINE_integer('num_train_epochs', 3, + 'Total number of training epochs to perform.') + flags.DEFINE_integer( + 'steps_per_loop', 200, + 'Number of steps per graph-mode loop. Only training step ' + 'happens inside the loop. Callbacks will not be called ' + 'inside.') + flags.DEFINE_boolean( + 'scale_loss', False, + 'Whether to divide the loss by number of replica inside the per-replica ' + 'loss function.') + flags.DEFINE_boolean( + 'use_keras_compile_fit', False, + 'If True, uses Keras compile/fit() API for training logic. Otherwise ' + 'use custom training loop.') + flags.DEFINE_string( + 'hub_module_url', None, 'TF-Hub path/url to Bert module. ' + 'If specified, init_checkpoint flag should not be used.') + flags.DEFINE_enum( + 'model_type', 'bert', ['bert', 'albert'], + 'Specifies the type of the model. ' + 'If "bert", will use canonical BERT; if "albert", will use ALBERT model.') + flags.DEFINE_boolean( + 'use_fp16', False, + 'Whether to use fp32 or fp16 arithmetic on GPU.') + flags.DEFINE_integer( + 'save_checkpoint_steps', 1000, + 'save checkpoint for every n steps') + flags.DEFINE_string( + 'dllog_path', 'bert_dllogger.json', 'filename where dllogger writes to') + flags.DEFINE_boolean( + 'benchmark', False, + 'Benchmark mode.') + + +def use_float16(): + return flags_core.get_tf_dtype(flags.FLAGS) == tf.float16 + + +def get_loss_scale(): + return flags_core.get_loss_scale(flags.FLAGS, default_for_fp16='dynamic') diff --git a/modelzoo/LanguageModeling/BERT/create_finetuning_data.py b/modelzoo/LanguageModeling/BERT/create_finetuning_data.py new file mode 100644 index 00000000..5bfeff55 --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/create_finetuning_data.py @@ -0,0 +1,184 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""BERT finetuning task dataset generator.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import json + +from absl import app +from absl import flags +import tensorflow as tf + +import classifier_data_lib +# word-piece tokenizer based squad_lib +import squad_lib as squad_lib_wp +# sentence-piece tokenizer based squad_lib +import squad_lib_sp +import tokenization + +FLAGS = flags.FLAGS + +flags.DEFINE_enum( + "fine_tuning_task_type", "classification", ["classification", "squad"], + "The name of the BERT fine tuning task for which data " + "will be generated..") + +# BERT classification specific flags. +flags.DEFINE_string( + "input_data_dir", None, + "The input data dir. Should contain the .tsv files (or other data files) " + "for the task.") + +flags.DEFINE_enum("classification_task_name", "MNLI", + ["COLA", "MNLI", "MRPC", "QNLI", "SST-2", "XNLI"], + "The name of the task to train BERT classifier.") + +# BERT Squad task specific flags. +flags.DEFINE_string( + "squad_data_file", None, + "The input data file in for generating training data for BERT squad task.") + +flags.DEFINE_integer( + "doc_stride", 128, + "When splitting up a long document into chunks, how much stride to " + "take between chunks.") + +flags.DEFINE_integer( + "max_query_length", 64, + "The maximum number of tokens for the question. Questions longer than " + "this will be truncated to this length.") + +flags.DEFINE_bool( + "version_2_with_negative", False, + "If true, the SQuAD examples contain some that do not have an answer.") + +# Shared flags across BERT fine-tuning tasks. +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_string( + "train_data_output_path", None, + "The path in which generated training input data will be written as tf" + " records.") + +flags.DEFINE_string( + "eval_data_output_path", None, + "The path in which generated training input data will be written as tf" + " records.") + +flags.DEFINE_string("meta_data_file_path", None, + "The path in which input meta data will be written.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_string("sp_model_file", "", + "The path to the model used by sentence piece tokenizer.") + +flags.DEFINE_enum( + "tokenizer_impl", "word_piece", ["word_piece", "sentence_piece"], + "Specifies the tokenizer implementation, i.e., whehter to use word_piece " + "or sentence_piece tokenizer. 
Canonical BERT uses word_piece tokenizer, " + "while ALBERT uses sentence_piece tokenizer.") + + +def generate_classifier_dataset(): + """Generates classifier dataset and returns input meta data.""" + assert FLAGS.input_data_dir and FLAGS.classification_task_name + + processors = { + "cola": classifier_data_lib.ColaProcessor, + "mnli": classifier_data_lib.MnliProcessor, + "mrpc": classifier_data_lib.MrpcProcessor, + "qnli": classifier_data_lib.QnliProcessor, + "sst-2": classifier_data_lib.SstProcessor, + "xnli": classifier_data_lib.XnliProcessor, + } + task_name = FLAGS.classification_task_name.lower() + if task_name not in processors: + raise ValueError("Task not found: %s" % (task_name)) + + if FLAGS.tokenizer_impl == "word_piece": + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + processor_text_fn = tokenization.convert_to_unicode + else: + assert FLAGS.tokenizer_impl == "sentence_piece" + tokenizer = tokenization.FullSentencePieceTokenizer(FLAGS.sp_model_file) + processor_text_fn = functools.partial( + tokenization.preprocess_text, lower=FLAGS.do_lower_case) + + processor = processors[task_name](processor_text_fn) + return classifier_data_lib.generate_tf_record_from_data_file( + processor, + FLAGS.input_data_dir, + tokenizer, + train_data_output_path=FLAGS.train_data_output_path, + eval_data_output_path=FLAGS.eval_data_output_path, + max_seq_length=FLAGS.max_seq_length) + + +def generate_squad_dataset(): + """Generates squad training dataset and returns input meta data.""" + assert FLAGS.squad_data_file + if FLAGS.tokenizer_impl == "word_piece": + return squad_lib_wp.generate_tf_record_from_json_file( + FLAGS.squad_data_file, FLAGS.vocab_file, FLAGS.train_data_output_path, + FLAGS.max_seq_length, FLAGS.do_lower_case, FLAGS.max_query_length, + FLAGS.doc_stride, FLAGS.version_2_with_negative) + else: + assert FLAGS.tokenizer_impl == "sentence_piece" + return squad_lib_sp.generate_tf_record_from_json_file( + FLAGS.squad_data_file, FLAGS.sp_model_file, + FLAGS.train_data_output_path, FLAGS.max_seq_length, FLAGS.do_lower_case, + FLAGS.max_query_length, FLAGS.doc_stride, FLAGS.version_2_with_negative) + + +def main(_): + if FLAGS.tokenizer_impl == "word_piece": + if not FLAGS.vocab_file: + raise ValueError( + "FLAG vocab_file for word-piece tokenizer is not specified.") + else: + assert FLAGS.tokenizer_impl == "sentence_piece" + if not FLAGS.sp_model_file: + raise ValueError( + "FLAG sp_model_file for sentence-piece tokenizer is not specified.") + + if FLAGS.fine_tuning_task_type == "classification": + input_meta_data = generate_classifier_dataset() + else: + input_meta_data = generate_squad_dataset() + + with tf.io.gfile.GFile(FLAGS.meta_data_file_path, "w") as writer: + writer.write(json.dumps(input_meta_data, indent=4) + "\n") + + +if __name__ == "__main__": + flags.mark_flag_as_required("train_data_output_path") + flags.mark_flag_as_required("meta_data_file_path") + app.run(main) diff --git a/modelzoo/LanguageModeling/BERT/create_pretraining_data.py b/modelzoo/LanguageModeling/BERT/create_pretraining_data.py new file mode 100644 index 00000000..69a5c696 --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/create_pretraining_data.py @@ -0,0 +1,655 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Create masked LM/next sentence masked_lm TF examples for BERT.""" + +import collections +import itertools +import random + +# Import libraries +from absl import app +from absl import flags +from absl import logging +import tensorflow as tf + +import tokenization + +FLAGS = flags.FLAGS + +flags.DEFINE_string("input_file", None, + "Input raw text file (or comma-separated list of files).") + +flags.DEFINE_string( + "output_file", None, + "Output TF example file (or comma-separated list of files).") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_bool( + "do_whole_word_mask", False, + "Whether to use whole word masking rather than per-WordPiece masking.") + +flags.DEFINE_integer( + "max_ngram_size", None, + "Mask contiguous whole words (n-grams) of up to `max_ngram_size` using a " + "weighting scheme to favor shorter n-grams. " + "Note: `--do_whole_word_mask=True` must also be set when n-gram masking.") + +flags.DEFINE_bool( + "gzip_compress", False, + "Whether to use `GZIP` compress option to get compressed TFRecord files.") + +flags.DEFINE_bool( + "use_v2_feature_names", False, + "Whether to use the feature names consistent with the models.") + +flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.") + +flags.DEFINE_integer("max_predictions_per_seq", 20, + "Maximum number of masked LM predictions per sequence.") + +flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.") + +flags.DEFINE_integer( + "dupe_factor", 10, + "Number of times to duplicate the input data (with different masks).") + +flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.") + +flags.DEFINE_float( + "short_seq_prob", 0.1, + "Probability of creating sequences which are shorter than the " + "maximum length.") + + +class TrainingInstance(object): + """A single training instance (sentence pair).""" + + def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels, + is_random_next): + self.tokens = tokens + self.segment_ids = segment_ids + self.is_random_next = is_random_next + self.masked_lm_positions = masked_lm_positions + self.masked_lm_labels = masked_lm_labels + + def __str__(self): + s = "" + s += "tokens: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.tokens])) + s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) + s += "is_random_next: %s\n" % self.is_random_next + s += "masked_lm_positions: %s\n" % (" ".join( + [str(x) for x in self.masked_lm_positions])) + s += "masked_lm_labels: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.masked_lm_labels])) + s += "\n" + return s + + def __repr__(self): + return self.__str__() + + +def write_instance_to_example_files(instances, tokenizer, max_seq_length, + max_predictions_per_seq, output_files, + gzip_compress, 
use_v2_feature_names): + """Creates TF example files from `TrainingInstance`s.""" + writers = [] + for output_file in output_files: + writers.append( + tf.io.TFRecordWriter( + output_file, options="GZIP" if gzip_compress else "")) + + writer_index = 0 + + total_written = 0 + for (inst_index, instance) in enumerate(instances): + input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) + input_mask = [1] * len(input_ids) + segment_ids = list(instance.segment_ids) + assert len(input_ids) <= max_seq_length + + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + masked_lm_positions = list(instance.masked_lm_positions) + masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels) + masked_lm_weights = [1.0] * len(masked_lm_ids) + + while len(masked_lm_positions) < max_predictions_per_seq: + masked_lm_positions.append(0) + masked_lm_ids.append(0) + masked_lm_weights.append(0.0) + + next_sentence_label = 1 if instance.is_random_next else 0 + + features = collections.OrderedDict() + if use_v2_feature_names: + features["input_word_ids"] = create_int_feature(input_ids) + features["input_type_ids"] = create_int_feature(segment_ids) + else: + features["input_ids"] = create_int_feature(input_ids) + features["segment_ids"] = create_int_feature(segment_ids) + + features["input_mask"] = create_int_feature(input_mask) + features["masked_lm_positions"] = create_int_feature(masked_lm_positions) + features["masked_lm_ids"] = create_int_feature(masked_lm_ids) + features["masked_lm_weights"] = create_float_feature(masked_lm_weights) + features["next_sentence_labels"] = create_int_feature([next_sentence_label]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + + writers[writer_index].write(tf_example.SerializeToString()) + writer_index = (writer_index + 1) % len(writers) + + total_written += 1 + + if inst_index < 20: + logging.info("*** Example ***") + logging.info("tokens: %s", " ".join( + [tokenization.printable_text(x) for x in instance.tokens])) + + for feature_name in features.keys(): + feature = features[feature_name] + values = [] + if feature.int64_list.value: + values = feature.int64_list.value + elif feature.float_list.value: + values = feature.float_list.value + logging.info("%s: %s", feature_name, " ".join([str(x) for x in values])) + + for writer in writers: + writer.close() + + logging.info("Wrote %d total instances", total_written) + + +def create_int_feature(values): + feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return feature + + +def create_float_feature(values): + feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) + return feature + + +def create_training_instances(input_files, + tokenizer, + max_seq_length, + dupe_factor, + short_seq_prob, + masked_lm_prob, + max_predictions_per_seq, + rng, + do_whole_word_mask=False, + max_ngram_size=None): + """Create `TrainingInstance`s from raw text.""" + all_documents = [[]] + + # Input file format: + # (1) One sentence per line. These should ideally be actual sentences, not + # entire paragraphs or arbitrary spans of text. (Because we use the + # sentence boundaries for the "next sentence prediction" task). + # (2) Blank lines between documents. Document boundaries are needed so + # that the "next sentence prediction" task doesn't span between documents. 
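+  #
+  # A minimal, purely illustrative input file in this format might look like:
+  #
+  #   The cat sat quietly on the old wooden porch.
+  #   It watched the birds gather near the feeder.
+  #
+  #   A second document starts here, after the blank delimiter line.
+  #   Each of its sentences is again on its own line.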
+ for input_file in input_files: + with tf.io.gfile.GFile(input_file, "rb") as reader: + while True: + line = tokenization.convert_to_unicode(reader.readline()) + if not line: + break + line = line.strip() + + # Empty lines are used as document delimiters + if not line: + all_documents.append([]) + tokens = tokenizer.tokenize(line) + if tokens: + all_documents[-1].append(tokens) + + # Remove empty documents + all_documents = [x for x in all_documents if x] + rng.shuffle(all_documents) + + vocab_words = list(tokenizer.vocab.keys()) + instances = [] + for _ in range(dupe_factor): + for document_index in range(len(all_documents)): + instances.extend( + create_instances_from_document( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng, + do_whole_word_mask, max_ngram_size)) + + rng.shuffle(instances) + return instances + + +def create_instances_from_document( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng, + do_whole_word_mask=False, + max_ngram_size=None): + """Creates `TrainingInstance`s for a single document.""" + document = all_documents[document_index] + + # Account for [CLS], [SEP], [SEP] + max_num_tokens = max_seq_length - 3 + + # We *usually* want to fill up the entire sequence since we are padding + # to `max_seq_length` anyways, so short sequences are generally wasted + # computation. However, we *sometimes* + # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter + # sequences to minimize the mismatch between pre-training and fine-tuning. + # The `target_seq_length` is just a rough target however, whereas + # `max_seq_length` is a hard limit. + target_seq_length = max_num_tokens + if rng.random() < short_seq_prob: + target_seq_length = rng.randint(2, max_num_tokens) + + # We DON'T just concatenate all of the tokens from a document into a long + # sequence and choose an arbitrary split point because this would make the + # next sentence prediction task too easy. Instead, we split the input into + # segments "A" and "B" based on the actual "sentences" provided by the user + # input. + instances = [] + current_chunk = [] + current_length = 0 + i = 0 + while i < len(document): + segment = document[i] + current_chunk.append(segment) + current_length += len(segment) + if i == len(document) - 1 or current_length >= target_seq_length: + if current_chunk: + # `a_end` is how many segments from `current_chunk` go into the `A` + # (first) sentence. + a_end = 1 + if len(current_chunk) >= 2: + a_end = rng.randint(1, len(current_chunk) - 1) + + tokens_a = [] + for j in range(a_end): + tokens_a.extend(current_chunk[j]) + + tokens_b = [] + # Random next + is_random_next = False + if len(current_chunk) == 1 or rng.random() < 0.5: + is_random_next = True + target_b_length = target_seq_length - len(tokens_a) + + # This should rarely go for more than one iteration for large + # corpora. However, just to be careful, we try to make sure that + # the random document is not the same as the document + # we're processing. 
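+          # With 10 bounded attempts, sampling the same document again is very
+          # unlikely for a normal-sized corpus; only for a tiny corpus (e.g.
+          # two documents) can a duplicate occasionally slip through.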
+ for _ in range(10): + random_document_index = rng.randint(0, len(all_documents) - 1) + if random_document_index != document_index: + break + + random_document = all_documents[random_document_index] + random_start = rng.randint(0, len(random_document) - 1) + for j in range(random_start, len(random_document)): + tokens_b.extend(random_document[j]) + if len(tokens_b) >= target_b_length: + break + # We didn't actually use these segments so we "put them back" so + # they don't go to waste. + num_unused_segments = len(current_chunk) - a_end + i -= num_unused_segments + # Actual next + else: + is_random_next = False + for j in range(a_end, len(current_chunk)): + tokens_b.extend(current_chunk[j]) + truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) + + assert len(tokens_a) >= 1 + assert len(tokens_b) >= 1 + + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + + tokens.append("[SEP]") + segment_ids.append(0) + + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + (tokens, masked_lm_positions, + masked_lm_labels) = create_masked_lm_predictions( + tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng, + do_whole_word_mask, max_ngram_size) + instance = TrainingInstance( + tokens=tokens, + segment_ids=segment_ids, + is_random_next=is_random_next, + masked_lm_positions=masked_lm_positions, + masked_lm_labels=masked_lm_labels) + instances.append(instance) + current_chunk = [] + current_length = 0 + i += 1 + + return instances + + +MaskedLmInstance = collections.namedtuple("MaskedLmInstance", + ["index", "label"]) + +# A _Gram is a [half-open) interval of token indices which form a word. +# E.g., +# words: ["The", "doghouse"] +# tokens: ["The", "dog", "##house"] +# grams: [(0,1), (1,3)] +_Gram = collections.namedtuple("_Gram", ["begin", "end"]) + + +def _window(iterable, size): + """Helper to create a sliding window iterator with a given size. + E.g., + input = [1, 2, 3, 4] + _window(input, 1) => [1], [2], [3], [4] + _window(input, 2) => [1, 2], [2, 3], [3, 4] + _window(input, 3) => [1, 2, 3], [2, 3, 4] + _window(input, 4) => [1, 2, 3, 4] + _window(input, 5) => None + Args: + iterable: elements to iterate over. + size: size of the window. + Yields: + Elements of `iterable` batched into a sliding window of length `size`. + """ + i = iter(iterable) + window = [] + try: + for e in range(0, size): + window.append(next(i)) + yield window + except StopIteration: + # handle the case where iterable's length is less than the window size. + return + for e in i: + window = window[1:] + [e] + yield window + + +def _contiguous(sorted_grams): + """Test whether a sequence of grams is contiguous. + Args: + sorted_grams: _Grams which are sorted in increasing order. + Returns: + True if `sorted_grams` are touching each other. + E.g., + _contiguous([(1, 4), (4, 5), (5, 10)]) == True + _contiguous([(1, 2), (4, 5)]) == False + """ + for a, b in _window(sorted_grams, 2): + if a.end != b.begin: + return False + return True + + +def _masking_ngrams(grams, max_ngram_size, max_masked_tokens, rng): + """Create a list of masking {1, ..., n}-grams from a list of one-grams. + This is an extention of 'whole word masking' to mask multiple, contiguous + words such as (e.g., "the red boat"). 
+ Each input gram represents the token indices of a single word, + words: ["the", "red", "boat"] + tokens: ["the", "red", "boa", "##t"] + grams: [(0,1), (1,2), (2,4)] + For a `max_ngram_size` of three, possible outputs masks include: + 1-grams: (0,1), (1,2), (2,4) + 2-grams: (0,2), (1,4) + 3-grams; (0,4) + Output masks will not overlap and contain less than `max_masked_tokens` total + tokens. E.g., for the example above with `max_masked_tokens` as three, + valid outputs are, + [(0,1), (1,2)] # "the", "red" covering two tokens + [(1,2), (2,4)] # "red", "boa", "##t" covering three tokens + The length of the selected n-gram follows a zipf weighting to + favor shorter n-gram sizes (weight(1)=1, weight(2)=1/2, weight(3)=1/3, ...). + Args: + grams: List of one-grams. + max_ngram_size: Maximum number of contiguous one-grams combined to create + an n-gram. + max_masked_tokens: Maximum total number of tokens to be masked. + rng: `random.Random` generator. + Returns: + A list of n-grams to be used as masks. + """ + if not grams: + return None + + grams = sorted(grams) + num_tokens = grams[-1].end + + # Ensure our grams are valid (i.e., they don't overlap). + for a, b in _window(grams, 2): + if a.end > b.begin: + raise ValueError("overlapping grams: {}".format(grams)) + + # Build map from n-gram length to list of n-grams. + ngrams = {i: [] for i in range(1, max_ngram_size+1)} + for gram_size in range(1, max_ngram_size+1): + for g in _window(grams, gram_size): + if _contiguous(g): + # Add an n-gram which spans these one-grams. + ngrams[gram_size].append(_Gram(g[0].begin, g[-1].end)) + + # Shuffle each list of n-grams. + for v in ngrams.values(): + rng.shuffle(v) + + # Create the weighting for n-gram length selection. + # Stored cummulatively for `random.choices` below. + cummulative_weights = list( + itertools.accumulate([1./n for n in range(1, max_ngram_size+1)])) + + output_ngrams = [] + # Keep a bitmask of which tokens have been masked. + masked_tokens = [False] * num_tokens + # Loop until we have enough masked tokens or there are no more candidate + # n-grams of any length. + # Each code path should ensure one or more elements from `ngrams` are removed + # to guarentee this loop terminates. + while (sum(masked_tokens) < max_masked_tokens and + sum(len(s) for s in ngrams.values())): + # Pick an n-gram size based on our weights. + sz = random.choices(range(1, max_ngram_size+1), + cum_weights=cummulative_weights)[0] + + # Ensure this size doesn't result in too many masked tokens. + # E.g., a two-gram contains _at least_ two tokens. + if sum(masked_tokens) + sz > max_masked_tokens: + # All n-grams of this length are too long and can be removed from + # consideration. + ngrams[sz].clear() + continue + + # All of the n-grams of this size have been used. + if not ngrams[sz]: + continue + + # Choose a random n-gram of the given size. + gram = ngrams[sz].pop() + num_gram_tokens = gram.end-gram.begin + + # Check if this would add too many tokens. + if num_gram_tokens + sum(masked_tokens) > max_masked_tokens: + continue + + # Check if any of the tokens in this gram have already been masked. + if sum(masked_tokens[gram.begin:gram.end]): + continue + + # Found a usable n-gram! Mark its tokens as masked and add it to return. + masked_tokens[gram.begin:gram.end] = [True] * (gram.end-gram.begin) + output_ngrams.append(gram) + return output_ngrams + + +def _wordpieces_to_grams(tokens): + """Reconstitue grams (words) from `tokens`. 
+ E.g., + tokens: ['[CLS]', 'That', 'lit', '##tle', 'blue', 'tru', '##ck', '[SEP]'] + grams: [ [1,2), [2, 4), [4,5) , [5, 6)] + Args: + tokens: list of wordpieces + Returns: + List of _Grams representing spans of whole words + (without "[CLS]" and "[SEP]"). + """ + grams = [] + gram_start_pos = None + for i, token in enumerate(tokens): + if gram_start_pos is not None and token.startswith("##"): + continue + if gram_start_pos is not None: + grams.append(_Gram(gram_start_pos, i)) + if token not in ["[CLS]", "[SEP]"]: + gram_start_pos = i + else: + gram_start_pos = None + if gram_start_pos is not None: + grams.append(_Gram(gram_start_pos, len(tokens))) + return grams + + +def create_masked_lm_predictions(tokens, masked_lm_prob, + max_predictions_per_seq, vocab_words, rng, + do_whole_word_mask, + max_ngram_size=None): + """Creates the predictions for the masked LM objective.""" + if do_whole_word_mask: + grams = _wordpieces_to_grams(tokens) + else: + # Here we consider each token to be a word to allow for sub-word masking. + if max_ngram_size: + raise ValueError("cannot use ngram masking without whole word masking") + grams = [_Gram(i, i+1) for i in range(0, len(tokens)) + if tokens[i] not in ["[CLS]", "[SEP]"]] + + num_to_predict = min(max_predictions_per_seq, + max(1, int(round(len(tokens) * masked_lm_prob)))) + # Generate masks. If `max_ngram_size` in [0, None] it means we're doing + # whole word masking or token level masking. Both of these can be treated + # as the `max_ngram_size=1` case. + masked_grams = _masking_ngrams(grams, max_ngram_size or 1, + num_to_predict, rng) + masked_lms = [] + output_tokens = list(tokens) + for gram in masked_grams: + # 80% of the time, replace all n-gram tokens with [MASK] + if rng.random() < 0.8: + replacement_action = lambda idx: "[MASK]" + else: + # 10% of the time, keep all the original n-gram tokens. + if rng.random() < 0.5: + replacement_action = lambda idx: tokens[idx] + # 10% of the time, replace each n-gram token with a random word. + else: + replacement_action = lambda idx: rng.choice(vocab_words) + + for idx in range(gram.begin, gram.end): + output_tokens[idx] = replacement_action(idx) + masked_lms.append(MaskedLmInstance(index=idx, label=tokens[idx])) + + assert len(masked_lms) <= num_to_predict + masked_lms = sorted(masked_lms, key=lambda x: x.index) + + masked_lm_positions = [] + masked_lm_labels = [] + for p in masked_lms: + masked_lm_positions.append(p.index) + masked_lm_labels.append(p.label) + + return (output_tokens, masked_lm_positions, masked_lm_labels) + + +def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng): + """Truncates a pair of sequences to a maximum sequence length.""" + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_num_tokens: + break + + trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b + assert len(trunc_tokens) >= 1 + + # We want to sometimes truncate from the front and sometimes from the + # back to add more randomness and avoid biases. 
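+    # Illustrative (hypothetical) walk-through: with max_num_tokens=8,
+    # len(tokens_a)=6 and len(tokens_b)=5, three tokens are removed one at a
+    # time, each taken from whichever sequence is currently longer (ties go to
+    # tokens_b), dropped from its front or back at random.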
+ if rng.random() < 0.5: + del trunc_tokens[0] + else: + trunc_tokens.pop() + + +def main(_): + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + input_files = [] + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.io.gfile.glob(input_pattern)) + + logging.info("*** Reading from input files ***") + for input_file in input_files: + logging.info(" %s", input_file) + + rng = random.Random(FLAGS.random_seed) + instances = create_training_instances( + input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor, + FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq, + rng, FLAGS.do_whole_word_mask, FLAGS.max_ngram_size) + + output_files = FLAGS.output_file.split(",") + logging.info("*** Writing to output files ***") + for output_file in output_files: + logging.info(" %s", output_file) + + write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length, + FLAGS.max_predictions_per_seq, output_files, + FLAGS.gzip_compress, + FLAGS.use_v2_feature_names) + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("output_file") + flags.mark_flag_as_required("vocab_file") + app.run(main) \ No newline at end of file diff --git a/modelzoo/LanguageModeling/BERT/data/BooksDownloader.py b/modelzoo/LanguageModeling/BERT/data/BooksDownloader.py new file mode 100644 index 00000000..53ee6c43 --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/data/BooksDownloader.py @@ -0,0 +1,26 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import subprocess + +class BooksDownloader: + def __init__(self, save_path): + self.save_path = save_path + pass + + + def download(self): + bookscorpus_download_command = 'python3 /workspace/bookcorpus/download_files.py --list /workspace/bookcorpus/url_list.jsonl --out' + bookscorpus_download_command += ' ' + self.save_path + '/bookscorpus' + bookscorpus_download_command += ' --trash-bad-count' + bookscorpus_download_process = subprocess.run(bookscorpus_download_command, shell=True, check=True) \ No newline at end of file diff --git a/modelzoo/LanguageModeling/BERT/data/BookscorpusTextFormatting.py b/modelzoo/LanguageModeling/BERT/data/BookscorpusTextFormatting.py new file mode 100644 index 00000000..22e48d4b --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/data/BookscorpusTextFormatting.py @@ -0,0 +1,32 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import glob +import os + +class BookscorpusTextFormatting: + def __init__(self, books_path, output_filename, recursive = False): + self.books_path = books_path + self.recursive = recursive + self.output_filename = output_filename + + + # This puts one book per line + def merge(self): + with open(self.output_filename, mode='w', newline='\n') as ofile: + for filename in glob.glob(self.books_path + '/' + '*.txt', recursive=True): + with open(filename, mode='r', encoding='utf-8-sig', newline='\n') as file: + for line in file: + if line.strip() != '': + ofile.write(line.strip() + ' ') + ofile.write("\n\n") \ No newline at end of file diff --git a/modelzoo/LanguageModeling/BERT/data/Downloader.py b/modelzoo/LanguageModeling/BERT/data/Downloader.py new file mode 100644 index 00000000..bb5c6287 --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/data/Downloader.py @@ -0,0 +1,123 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from GooglePretrainedWeightDownloader import GooglePretrainedWeightDownloader +from NVIDIAPretrainedWeightDownloader import NVIDIAPretrainedWeightDownloader +from WikiDownloader import WikiDownloader +from BooksDownloader import BooksDownloader +from GLUEDownloader import GLUEDownloader +from SquadDownloader import SquadDownloader +from PubMedDownloader import PubMedDownloader + +class Downloader: + def __init__(self, dataset_name, save_path): + self.dataset_name = dataset_name + self.save_path = save_path + + + def download(self): + if self.dataset_name == 'bookscorpus': + self.download_bookscorpus() + + elif self.dataset_name == 'wikicorpus_en': + self.download_wikicorpus('en') + + elif self.dataset_name == 'wikicorpus_zh': + self.download_wikicorpus('zh') + + elif self.dataset_name == 'pubmed_baseline': + self.download_pubmed('baseline') + + elif self.dataset_name == 'pubmed_daily_update': + self.download_pubmed('daily_update') + + elif self.dataset_name == 'pubmed_fulltext': + self.download_pubmed('fulltext') + + elif self.dataset_name == 'pubmed_open_access': + self.download_pubmed('open_access') + + elif self.dataset_name == 'google_pretrained_weights': + self.download_google_pretrained_weights() + + elif self.dataset_name == 'nvidia_pretrained_weights': + self.download_nvidia_pretrained_weights() + + elif self.dataset_name == 'mrpc': + self.download_glue(self.dataset_name) + + elif self.dataset_name == 'mnli': + self.download_glue(self.dataset_name) + + elif self.dataset_name == 'cola': + self.download_glue(self.dataset_name) + elif self.dataset_name == 'sst-2': + self.download_glue(self.dataset_name) + + elif self.dataset_name == 'squad': + self.download_squad() + + elif self.dataset_name == 'all': + self.download_bookscorpus() + self.download_wikicorpus('en') + self.download_wikicorpus('zh') + self.download_pubmed('baseline') + self.download_pubmed('daily_update') + self.download_pubmed('fulltext') + 
self.download_pubmed('open_access') + self.download_google_pretrained_weights() + self.download_nvidia_pretrained_weights() + self.download_glue("cola") + self.download_glue("mnli") + self.download_glue("mrpc") + self.download_glue("sst-2") + self.download_squad() + + else: + print(self.dataset_name) + assert False, 'Unknown dataset_name provided to downloader' + + + def download_bookscorpus(self): + downloader = BooksDownloader(self.save_path) + downloader.download() + + + def download_wikicorpus(self, language): + downloader = WikiDownloader(language, self.save_path) + downloader.download() + + + def download_pubmed(self, subset): + downloader = PubMedDownloader(subset, self.save_path) + downloader.download() + + + def download_google_pretrained_weights(self): + downloader = GooglePretrainedWeightDownloader(self.save_path) + downloader.download() + + + def download_nvidia_pretrained_weights(self): + downloader = NVIDIAPretrainedWeightDownloader(self.save_path) + downloader.download() + + + def download_glue(self, glue_task_name): + downloader = GLUEDownloader(self.save_path) + downloader.download(glue_task_name) + + + def download_squad(self): + downloader = SquadDownloader(self.save_path) + downloader.download() diff --git a/modelzoo/LanguageModeling/BERT/data/GLUEDownloader.py b/modelzoo/LanguageModeling/BERT/data/GLUEDownloader.py new file mode 100644 index 00000000..4c1e701f --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/data/GLUEDownloader.py @@ -0,0 +1,46 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import wget + +from pathlib import Path + + +def mkdir(path): + Path(path).mkdir(parents=True, exist_ok=True) + + +class GLUEDownloader: + + def __init__(self, save_path): + self.save_path = save_path + '/glue' + + def download(self, task_name): + mkdir(self.save_path) + if task_name in {'mrpc', 'mnli'}: + task_name = task_name.upper() + elif task_name == 'cola': + task_name = 'CoLA' + else: # SST-2 + assert task_name == 'sst-2' + task_name = 'SST' + wget.download( + 'https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py', + out=self.save_path, + ) + sys.path.append(self.save_path) + import download_glue_data + download_glue_data.main( + ['--data_dir', self.save_path, '--tasks', task_name]) + sys.path.pop() diff --git a/modelzoo/LanguageModeling/BERT/data/GooglePretrainedWeightDownloader.py b/modelzoo/LanguageModeling/BERT/data/GooglePretrainedWeightDownloader.py new file mode 100644 index 00000000..7d21f0bf --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/data/GooglePretrainedWeightDownloader.py @@ -0,0 +1,157 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import hashlib +import os +import urllib.request +import tarfile + +class GooglePretrainedWeightDownloader: + def __init__(self, save_path): + self.save_path = save_path + '/google_pretrained_weights' + + if not os.path.exists(self.save_path): + os.makedirs(self.save_path) + + # Download urls + self.model_urls = { + 'bert_base_uncased': ('https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12.tar.gz', 'uncased_L-12_H-768_A-12.tar.gz'), + 'bert_large_uncased': ('https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/uncased_L-24_H-1024_A-16.tar.gz', 'uncased_L-24_H-1024_A-16.tar.gz'), + # 'bert_base_cased': ('https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/cased_L-12_H-768_A-12.tar.gz', 'cased_L-12_H-768_A-12.tar.gz'), + # 'bert_large_cased': ('https://storage.googleapis.com/cloud-tpu-checkpoints/bert/keras_bert/cased_L-24_H-1024_A-16.tar.gz', 'cased_L-24_H-1024_A-16.tar.gz'), + # 'bert_base_multilingual_cased': ('https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip', 'multi_cased_L-12_H-768_A-12.zip'), + # 'bert_large_multilingual_uncased': ('https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip', 'multilingual_L-12_H-768_A-12.zip'), + # 'bert_base_chinese': ('https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip', 'chinese_L-12_H-768_A-12.zip') + } + + # SHA256sum verification for file download integrity (and checking for changes from the download source over time) + self.bert_base_uncased_sha = { + 'bert_config.json': '7b4e5f53efbd058c67cda0aacfafb340113ea1b5797d9ce6ee411704ba21fcbc', + 'bert_model.ckpt.data-00000-of-00001': 'f8d2e9873133ea4d252662be01a074fb6b9e115d5fd1e3678d385cf65cf5210f', + 'bert_model.ckpt.index': '06a6b8cdff0e61f62f8f24946a607aa6f5ad9b969c1b85363541ab144f80c767', + # 'checkpoint': 'da4c827756174a576abc3490e385fa8a36600cf5eb7bbea29315cf1f4ad59639', + 'vocab.txt': '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3', + } + + self.bert_large_uncased_sha = { + 'bert_config.json': 'bfa42236d269e2aeb3a6d30412a33d15dbe8ea597e2b01dc9518c63cc6efafcb', + 'bert_model.ckpt.data-00000-of-00001': '9aa66efcbbbfd87fc173115c4f906a42a70d26ca4ca1e318358e4de81dbddb0b', + 'bert_model.ckpt.index': '1811d5b68b2fd1a8c5d2961b2691eb626d75c4e789079eb1ba3649aa3fff7336', + # 'checkpoint': 'da4c827756174a576abc3490e385fa8a36600cf5eb7bbea29315cf1f4ad59639', + 'vocab.txt': '07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3', + } + + self.bert_base_cased_sha = { + 'bert_config.json': 'f11dfb757bea16339a33e1bf327b0aade6e57fd9c29dc6b84f7ddb20682f48bc', + 'bert_model.ckpt.data-00000-of-00001': 'ed0febc0fbcd2b7ef9f02112e00cb26c5de2086bca26c07b48b09c723446bc85', + 'bert_model.ckpt.index': 'af085a027ef3686466c9b662f9174129401bb4bc49856c917c02322ab7ca26d5', + 'checkpoint': 'da4c827756174a576abc3490e385fa8a36600cf5eb7bbea29315cf1f4ad59639', + 'vocab.txt': 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02', + } + + self.bert_large_cased_sha = { + 'bert_config.json': 
'7adb2125c8225da495656c982fd1c5f64ba8f20ad020838571a3f8a954c2df57', + 'bert_model.ckpt.data-00000-of-00001': '1f96efeac7c8728e2bacb8ec6230f5ed42a26f5aa6b6b0a138778c190adf2a0b', + 'bert_model.ckpt.index': '373ed159af87775ce549239649bfc4df825bffab0da31620575dab44818443c3', + 'checkpoint': 'da4c827756174a576abc3490e385fa8a36600cf5eb7bbea29315cf1f4ad59639', + 'vocab.txt': 'eeaa9875b23b04b4c54ef759d03db9d1ba1554838f8fb26c5d96fa551df93d02', + } + + self.bert_base_multilingual_cased_sha = { + 'bert_config.json': 'e76c3964bc14a8bb37a5530cdc802699d2f4a6fddfab0611e153aa2528f234f0', + 'bert_model.ckpt.data-00000-of-00001': '55b8a2df41f69c60c5180e50a7c31b7cdf6238909390c4ddf05fbc0d37aa1ac5', + 'bert_model.ckpt.index': '7d8509c2a62b4e300feb55f8e5f1eef41638f4998dd4d887736f42d4f6a34b37', + 'bert_model.ckpt.meta': '95e5f1997e8831f1c31e5cf530f1a2e99f121e9cd20887f2dce6fe9e3343e3fa', + 'vocab.txt': 'fe0fda7c425b48c516fc8f160d594c8022a0808447475c1a7c6d6479763f310c', + } + + self.bert_large_multilingual_uncased_sha = { + 'bert_config.json': '49063bb061390211d2fdd108cada1ed86faa5f90b80c8f6fdddf406afa4c4624', + 'bert_model.ckpt.data-00000-of-00001': '3cd83912ebeb0efe2abf35c9f1d5a515d8e80295e61c49b75c8853f756658429', + 'bert_model.ckpt.index': '87c372c1a3b1dc7effaaa9103c80a81b3cbab04c7933ced224eec3b8ad2cc8e7', + 'bert_model.ckpt.meta': '27f504f34f02acaa6b0f60d65195ec3e3f9505ac14601c6a32b421d0c8413a29', + 'vocab.txt': '87b44292b452f6c05afa49b2e488e7eedf79ea4f4c39db6f2f4b37764228ef3f', + } + + self.bert_base_chinese_sha = { + 'bert_config.json': '7aaad0335058e2640bcb2c2e9a932b1cd9da200c46ea7b8957d54431f201c015', + 'bert_model.ckpt.data-00000-of-00001': '756699356b78ad0ef1ca9ba6528297bcb3dd1aef5feadd31f4775d7c7fc989ba', + 'bert_model.ckpt.index': '46315546e05ce62327b3e2cd1bed22836adcb2ff29735ec87721396edb21b82e', + 'bert_model.ckpt.meta': 'c0f8d51e1ab986604bc2b25d6ec0af7fd21ff94cf67081996ec3f3bf5d823047', + 'vocab.txt': '45bbac6b341c319adc98a532532882e91a9cefc0329aa57bac9ae761c27b291c', + } + + # Relate SHA to urls for loop below + self.model_sha = { + 'bert_base_uncased': self.bert_base_uncased_sha, + 'bert_large_uncased': self.bert_large_uncased_sha, + # 'bert_base_cased': self.bert_base_cased_sha, + # 'bert_large_cased': self.bert_large_cased_sha, + # 'bert_base_multilingual_cased': self.bert_base_multilingual_cased_sha, + # 'bert_large_multilingual_uncased': self.bert_large_multilingual_uncased_sha, + # 'bert_base_chinese': self.bert_base_chinese_sha + } + + # Helper to get sha256sum of a file + def sha256sum(self, filename): + h = hashlib.sha256() + b = bytearray(128*1024) + mv = memoryview(b) + with open(filename, 'rb', buffering=0) as f: + for n in iter(lambda : f.readinto(mv), 0): + h.update(mv[:n]) + + return h.hexdigest() + + def download(self): + # Iterate over urls: download, unzip, verify sha256sum + found_mismatch_sha = False + for model in self.model_urls: + url = self.model_urls[model][0] + file = self.save_path + '/' + self.model_urls[model][1] + + print('Downloading', url) + response = urllib.request.urlopen(url) + with open(file, 'wb') as handle: + handle.write(response.read()) + + print('Unzipping', file) + tf = tarfile.open(file) + tf.extractall(self.save_path) + + sha_dict = self.model_sha[model] + for extracted_file in sha_dict: + sha = sha_dict[extracted_file] + if sha != self.sha256sum(file[:-7] + '/' + extracted_file): + found_mismatch_sha = True + print('SHA256sum does not match on file:', extracted_file, 'from download url:', url) + else: + print(file[:-7] + '/' + extracted_file, '\t', 
'verified') + + if not found_mismatch_sha: + print("All downloads pass sha256sum verification.") + + def serialize(self): + pass + + def deserialize(self): + pass + + def listAvailableWeights(self): + print("Available Weight Datasets") + for item in self.model_urls: + print(item) + + def listLocallyStoredWeights(self): + pass + diff --git a/modelzoo/LanguageModeling/BERT/data/NVIDIAPretrainedWeightDownloader.py b/modelzoo/LanguageModeling/BERT/data/NVIDIAPretrainedWeightDownloader.py new file mode 100644 index 00000000..13c9a320 --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/data/NVIDIAPretrainedWeightDownloader.py @@ -0,0 +1,27 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +class NVIDIAPretrainedWeightDownloader: + def __init__(self, save_path): + self.save_path = save_path + '/nvidia_pretrained_weights' + + if not os.path.exists(self.save_path): + os.makedirs(self.save_path) + + pass + + + def download(self): + assert False, 'NVIDIAPretrainedWeightDownloader not implemented yet.' \ No newline at end of file diff --git a/modelzoo/LanguageModeling/BERT/data/PubMedDownloader.py b/modelzoo/LanguageModeling/BERT/data/PubMedDownloader.py new file mode 100644 index 00000000..a2aef07a --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/data/PubMedDownloader.py @@ -0,0 +1,93 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
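+
+# Typical usage, mirroring how Downloader.py drives this class:
+#   PubMedDownloader('baseline', save_path).download()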
+ +import bz2 +import glob +import gzip +import os +import urllib.request +import shutil +import sys + +class PubMedDownloader: + def __init__(self, subset, save_path): + self.subset = subset + # Modifying self.save_path in two steps to handle creation of subdirectories + self.save_path = save_path + '/pubmed' + '/' + + if not os.path.exists(self.save_path): + os.makedirs(self.save_path) + + self.save_path = self.save_path + '/' + subset + + if not os.path.exists(self.save_path): + os.makedirs(self.save_path) + + self.download_urls = { + 'baseline' : 'ftp://ftp.ncbi.nlm.nih.gov/pubmed/baseline/', + 'daily_update' : 'ftp://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/', + 'fulltext' : 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/', + 'open_access' : 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_bulk/' + } + + + def download(self): + print('subset:', self.subset) + url = self.download_urls[self.subset] + self.download_files(url) + self.extract_files() + + + def download_files(self, url): + url = self.download_urls[self.subset] + output = os.popen('curl ' + url).read() + + if self.subset == 'fulltext' or self.subset == 'open_access': + line_split = 'comm_use' if self.subset == 'fulltext' else 'non_comm_use' + for line in output.splitlines(): + if line[-10:] == 'xml.tar.gz' and \ + line.split(' ')[-1].split('.')[0] == line_split: + file = os.path.join(self.save_path, line.split(' ')[-1]) + if not os.path.isfile(file): + print('Downloading', file) + response = urllib.request.urlopen(url + line.split(' ')[-1]) + with open(file, "wb") as handle: + handle.write(response.read()) + + elif self.subset == 'baseline' or self.subset == 'daily_update': + for line in output.splitlines(): + if line[-3:] == '.gz': + file = os.path.join(self.save_path, line.split(' ')[-1]) + if not os.path.isfile(file): + print('Downloading', file) + response = urllib.request.urlopen(url + line.split(' ')[-1]) + with open(file, "wb") as handle: + handle.write(response.read()) + else: + assert False, 'Invalid PubMed dataset/subset specified.' + + def extract_files(self): + files = glob.glob(self.save_path + '/*.xml.gz') + + for file in files: + print('file:', file) + input = gzip.GzipFile(file, mode='rb') + s = input.read() + input.close() + + out = open(file[:-3], mode='wb') + out.write(s) + out.close() + + + diff --git a/modelzoo/LanguageModeling/BERT/data/PubMedTextFormatting.py b/modelzoo/LanguageModeling/BERT/data/PubMedTextFormatting.py new file mode 100644 index 00000000..df851789 --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/data/PubMedTextFormatting.py @@ -0,0 +1,44 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
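+
+# Typical usage (the output filename here is a placeholder):
+#   PubMedTextFormatting(pubmed_path, 'pubmed_corpus.txt').merge()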
+ +import glob +import os +import pubmed_parser as pmp + +class PubMedTextFormatting: + def __init__(self, pubmed_path, output_filename, recursive = False): + self.pubmed_path = pubmed_path + self.recursive = recursive + self.output_filename = output_filename + + + # This puts one article per line + def merge(self): + print('PubMed path:', self.pubmed_path) + + with open(self.output_filename, mode='w', newline='\n') as ofile: + for filename in glob.glob(self.pubmed_path + '/*.xml*', recursive=self.recursive): + print('file:', filename) + dicts_out = pmp.parse_medline_xml(filename) + for dict_out in dicts_out: + if not dict_out['abstract']: + continue + try: + for line in dict_out['abstract'].splitlines(): + if len(line) < 30: + continue + ofile.write(line.strip() + " ") + ofile.write("\n\n") + except: + ofile.write("\n\n") + continue diff --git a/modelzoo/LanguageModeling/BERT/data/README.md b/modelzoo/LanguageModeling/BERT/data/README.md new file mode 100644 index 00000000..48168422 --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/data/README.md @@ -0,0 +1,28 @@ +Steps to reproduce datasets from web + +1) Build the container + * docker build -t bert_tf2 . +2) Run the container interactively + * nvidia-docker run -it --ipc=host bert_tf2 + * Optional: Mount data volumes + * -v yourpath:/workspace/bert_tf2/data/wikipedia_corpus/download + * -v yourpath:/workspace/bert_tf2/data/wikipedia_corpus/extracted_articles + * -v yourpath:/workspace/bert_tf2/data/wikipedia_corpus/raw_data + * -v yourpath:/workspace/bert_tf2/data/wikipedia_corpus/intermediate_files + * -v yourpath:/workspace/bert_tf2/data/wikipedia_corpus/final_text_file_single + * -v yourpath:/workspace/bert_tf2/data/wikipedia_corpus/final_text_files_sharded + * -v yourpath:/workspace/bert_tf2/data/wikipedia_corpus/final_tfrecords_sharded + * -v yourpath:/workspace/bert_tf2/data/bookcorpus/download + * -v yourpath:/workspace/bert_tf2/data/bookcorpus/final_text_file_single + * -v yourpath:/workspace/bert_tf2/data/bookcorpus/final_text_files_sharded + * -v yourpath:/workspace/bert_tf2/data/bookcorpus/final_tfrecords_sharded + * Optional: Select visible GPUs + * -e CUDA_VISIBLE_DEVICES=0 + +** Inside of the container starting here** +3) Download pretrained weights (they contain vocab files for preprocessing) and SQuAD + * bash data/create_datasets_from_start.sh squad +5) "One-click" Wikipedia data download and prep (provides tfrecords) + * bash data/create_datasets_from_start.sh pretrained wiki_only +6) "One-click" Wikipedia and BookCorpus data download and prep (provided tfrecords) + * bash data/create_datasets_from_start.sh pretrained wiki_books diff --git a/modelzoo/LanguageModeling/BERT/data/SquadDownloader.py b/modelzoo/LanguageModeling/BERT/data/SquadDownloader.py new file mode 100644 index 00000000..6d64ffc6 --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/data/SquadDownloader.py @@ -0,0 +1,54 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
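+
+# Typical usage, as invoked from Downloader.py:
+#   SquadDownloader(save_path).download()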
+ +import bz2 +import os +import urllib.request +import sys + +class SquadDownloader: + def __init__(self, save_path): + self.save_path = save_path + '/squad' + + if not os.path.exists(self.save_path): + os.makedirs(self.save_path) + + if not os.path.exists(self.save_path + '/v1.1'): + os.makedirs(self.save_path + '/v1.1') + + if not os.path.exists(self.save_path + '/v2.0'): + os.makedirs(self.save_path + '/v2.0') + + self.download_urls = { + 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json' : 'v1.1/train-v1.1.json', + 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json' : 'v1.1/dev-v1.1.json', + 'https://worksheets.codalab.org/rest/bundles/0xbcd57bee090b421c982906709c8c27e1/contents/blob/' : 'v1.1/evaluate-v1.1.py', + 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json' : 'v2.0/train-v2.0.json', + 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json' : 'v2.0/dev-v2.0.json', + 'https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/' : 'v2.0/evaluate-v2.0.py', + } + + def download(self): + for item in self.download_urls: + url = item + file = self.download_urls[item] + + print('Downloading:', url) + if os.path.isfile(self.save_path + '/' + file): + print('** Download file already exists, skipping download') + else: + response = urllib.request.urlopen(url) + with open(self.save_path + '/' + file, "wb") as handle: + handle.write(response.read()) + + diff --git a/modelzoo/LanguageModeling/BERT/data/TextSharding.py b/modelzoo/LanguageModeling/BERT/data/TextSharding.py new file mode 100644 index 00000000..a6b0ca49 --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/data/TextSharding.py @@ -0,0 +1,331 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import defaultdict +from itertools import islice + +import multiprocessing +import os +import statistics + +class Sharding: + def __init__(self, input_files, output_name_prefix, n_training_shards, n_test_shards, fraction_test_set): + assert len(input_files) > 0, 'The input file list must contain at least one file.' + assert n_training_shards > 0, 'There must be at least one output shard.' + assert n_test_shards > 0, 'There must be at least one output shard.' 
+ + self.n_training_shards = n_training_shards + self.n_test_shards = n_test_shards + self.fraction_test_set = fraction_test_set + + self.input_files = input_files + + self.output_name_prefix = output_name_prefix + self.output_training_identifier = '_training' + self.output_test_identifier = '_test' + self.output_file_extension = '.txt' + + self.articles = {} # key: integer identifier, value: list of articles + self.sentences = {} # key: integer identifier, value: list of sentences + self.output_training_files = {} # key: filename, value: list of articles to go into file + self.output_test_files = {} # key: filename, value: list of articles to go into file + + self.init_output_files() + + + # Remember, the input files contain one article per line (the whitespace check is to skip extraneous blank lines) + def load_articles(self): + print('Start: Loading Articles') + + global_article_count = 0 + for input_file in self.input_files: + print('input file:', input_file) + with open(input_file, mode='r', newline='\n') as f: + for i, line in enumerate(f): + if line.strip(): + self.articles[global_article_count] = line.rstrip() + global_article_count += 1 + + print('End: Loading Articles: There are', len(self.articles), 'articles.') + + + def segment_articles_into_sentences(self, segmenter): + print('Start: Sentence Segmentation') + if len(self.articles) == 0: + self.load_articles() + + assert len(self.articles) != 0, 'Please check that input files are present and contain data.' + + # TODO: WIP: multiprocessing (create independent ranges and spawn processes) + use_multiprocessing = 'serial' + + def chunks(data, size=len(self.articles)): + it = iter(data) + for i in range(0, len(data), size): + yield {k: data[k] for k in islice(it, size)} + + if use_multiprocessing == 'manager': + manager = multiprocessing.Manager() + return_dict = manager.dict() + jobs = [] + n_processes = 7 # in addition to the main process, total = n_proc+1 + + def work(articles, return_dict): + sentences = {} + for i, article in enumerate(articles): + sentences[i] = segmenter.segment_string(articles[article]) + + if i % 5000 == 0: + print('Segmenting article', i) + + return_dict.update(sentences) + + for item in chunks(self.articles, len(self.articles)): + p = multiprocessing.Process(target=work, args=(item, return_dict)) + + # Busy wait + while len(jobs) >= n_processes: + pass + + jobs.append(p) + p.start() + + for proc in jobs: + proc.join() + + elif use_multiprocessing == 'queue': + work_queue = multiprocessing.Queue() + jobs = [] + + for item in chunks(self.articles, len(self.articles)): + pass + + else: # serial option + for i, article in enumerate(self.articles): + self.sentences[i] = segmenter.segment_string(self.articles[article]) + + if i % 5000 == 0: + print('Segmenting article', i) + + print('End: Sentence Segmentation') + + + def init_output_files(self): + print('Start: Init Output Files') + assert len(self.output_training_files) == 0, 'Internal storage self.output_files already contains data. This function is intended to be used by the constructor only.' + assert len(self.output_test_files) == 0, 'Internal storage self.output_files already contains data. This function is intended to be used by the constructor only.' 
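+        # For example, with output_name_prefix='wiki' and n_training_shards=256
+        # this registers 'wiki_training_0.txt' ... 'wiki_training_255.txt' (and
+        # the analogous '<prefix>_test_<i>.txt' names), each mapped to an empty
+        # list of article ids.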
+ + for i in range(self.n_training_shards): + name = self.output_name_prefix + self.output_training_identifier + '_' + str(i) + self.output_file_extension + self.output_training_files[name] = [] + + for i in range(self.n_test_shards): + name = self.output_name_prefix + self.output_test_identifier + '_' + str(i) + self.output_file_extension + self.output_test_files[name] = [] + + print('End: Init Output Files') + + + def get_sentences_per_shard(self, shard): + result = 0 + for article_id in shard: + result += len(self.sentences[article_id]) + + return result + + + def distribute_articles_over_shards(self): + print('Start: Distribute Articles Over Shards') + assert len(self.articles) >= self.n_training_shards + self.n_test_shards, 'There are fewer articles than shards. Please add more data or reduce the number of shards requested.' + + # Create dictionary with - key: sentence count per article, value: article id number + sentence_counts = defaultdict(lambda: []) + + max_sentences = 0 + total_sentences = 0 + + for article_id in self.sentences: + current_length = len(self.sentences[article_id]) + sentence_counts[current_length].append(article_id) + max_sentences = max(max_sentences, current_length) + total_sentences += current_length + + n_sentences_assigned_to_training = int((1 - self.fraction_test_set) * total_sentences) + nominal_sentences_per_training_shard = n_sentences_assigned_to_training // self.n_training_shards + nominal_sentences_per_test_shard = (total_sentences - n_sentences_assigned_to_training) // self.n_test_shards + + consumed_article_set = set({}) + unused_article_set = set(self.articles.keys()) + + # Make first pass and add one article worth of lines per file + for file in self.output_training_files: + current_article_id = sentence_counts[max_sentences][-1] + sentence_counts[max_sentences].pop(-1) + self.output_training_files[file].append(current_article_id) + consumed_article_set.add(current_article_id) + unused_article_set.remove(current_article_id) + + # Maintain the max sentence count + while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0: + max_sentences -= 1 + + if len(self.sentences[current_article_id]) > nominal_sentences_per_training_shard: + nominal_sentences_per_training_shard = len(self.sentences[current_article_id]) + print('Warning: A single article contains more than the nominal number of sentences per training shard.') + + for file in self.output_test_files: + current_article_id = sentence_counts[max_sentences][-1] + sentence_counts[max_sentences].pop(-1) + self.output_test_files[file].append(current_article_id) + consumed_article_set.add(current_article_id) + unused_article_set.remove(current_article_id) + + # Maintain the max sentence count + while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0: + max_sentences -= 1 + + if len(self.sentences[current_article_id]) > nominal_sentences_per_test_shard: + nominal_sentences_per_test_shard = len(self.sentences[current_article_id]) + print('Warning: A single article contains more than the nominal number of sentences per test shard.') + + training_counts = [] + test_counts = [] + + for shard in self.output_training_files: + training_counts.append(self.get_sentences_per_shard(self.output_training_files[shard])) + + for shard in self.output_test_files: + test_counts.append(self.get_sentences_per_shard(self.output_test_files[shard])) + + training_median = statistics.median(training_counts) + test_median = statistics.median(test_counts) + + # Make subsequent passes over files to find 
articles to add without going over limit + history_remaining = [] + n_history_remaining = 4 + + while len(consumed_article_set) < len(self.articles): + for fidx, file in enumerate(self.output_training_files): + nominal_next_article_size = min(nominal_sentences_per_training_shard - training_counts[fidx], max_sentences) + + # Maintain the max sentence count + while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0: + max_sentences -= 1 + + while len(sentence_counts[nominal_next_article_size]) == 0 and nominal_next_article_size > 0: + nominal_next_article_size -= 1 + + if nominal_next_article_size not in sentence_counts or nominal_next_article_size == 0 or training_counts[fidx] > training_median: + continue # skip adding to this file, will come back later if no file can accept unused articles + + current_article_id = sentence_counts[nominal_next_article_size][-1] + sentence_counts[nominal_next_article_size].pop(-1) + + self.output_training_files[file].append(current_article_id) + consumed_article_set.add(current_article_id) + unused_article_set.remove(current_article_id) + + for fidx, file in enumerate(self.output_test_files): + nominal_next_article_size = min(nominal_sentences_per_test_shard - test_counts[fidx], max_sentences) + + # Maintain the max sentence count + while len(sentence_counts[max_sentences]) == 0 and max_sentences > 0: + max_sentences -= 1 + + while len(sentence_counts[nominal_next_article_size]) == 0 and nominal_next_article_size > 0: + nominal_next_article_size -= 1 + + if nominal_next_article_size not in sentence_counts or nominal_next_article_size == 0 or test_counts[fidx] > test_median: + continue # skip adding to this file, will come back later if no file can accept unused articles + + current_article_id = sentence_counts[nominal_next_article_size][-1] + sentence_counts[nominal_next_article_size].pop(-1) + + self.output_test_files[file].append(current_article_id) + consumed_article_set.add(current_article_id) + unused_article_set.remove(current_article_id) + + # If unable to place articles a few times, bump up nominal sizes by fraction until articles get placed + if len(history_remaining) == n_history_remaining: + history_remaining.pop(0) + history_remaining.append(len(unused_article_set)) + + history_same = True + for i in range(1, len(history_remaining)): + history_same = history_same and (history_remaining[i-1] == history_remaining[i]) + + if history_same: + nominal_sentences_per_training_shard += 1 + # nominal_sentences_per_test_shard += 1 + + training_counts = [] + test_counts = [] + for shard in self.output_training_files: + training_counts.append(self.get_sentences_per_shard(self.output_training_files[shard])) + + for shard in self.output_test_files: + test_counts.append(self.get_sentences_per_shard(self.output_test_files[shard])) + + training_median = statistics.median(training_counts) + test_median = statistics.median(test_counts) + + print('Distributing data over shards:', len(unused_article_set), 'articles remaining.') + + + if len(unused_article_set) != 0: + print('Warning: Some articles did not make it into output files.') + + + for shard in self.output_training_files: + print('Training shard:', self.get_sentences_per_shard(self.output_training_files[shard])) + + for shard in self.output_test_files: + print('Test shard:', self.get_sentences_per_shard(self.output_test_files[shard])) + + print('End: Distribute Articles Over Shards') + + + def write_shards_to_disk(self): + print('Start: Write Shards to Disk') + for shard in 
self.output_training_files:
+            self.write_single_shard(shard, self.output_training_files[shard], 'training')
+
+        for shard in self.output_test_files:
+            self.write_single_shard(shard, self.output_test_files[shard], 'test')
+
+        print('End: Write Shards to Disk')
+
+    def write_single_shard(self, shard_name, shard, split):
+        shard_split = os.path.split(shard_name)
+        shard_name = shard_split[0] + '/' + split + '/' + shard_split[1]
+
+        with open(shard_name, mode='w', newline='\n') as f:
+            for article_id in shard:
+                for line in self.sentences[article_id]:
+                    f.write(line + '\n')
+
+                f.write('\n')  # Line break between articles
+
+
+import nltk
+
+nltk.download('punkt')
+
+class NLTKSegmenter:
+    def __init__(self):
+        pass
+
+    def segment_string(self, article):
+        return nltk.tokenize.sent_tokenize(article)
+
diff --git a/modelzoo/LanguageModeling/BERT/data/WikiDownloader.py b/modelzoo/LanguageModeling/BERT/data/WikiDownloader.py
new file mode 100644
index 00000000..1e5e36ce
--- /dev/null
+++ b/modelzoo/LanguageModeling/BERT/data/WikiDownloader.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import bz2
+import os
+import urllib.request
+import sys
+import subprocess
+
+class WikiDownloader:
+    def __init__(self, language, save_path):
+        self.save_path = save_path + '/wikicorpus_' + language
+
+        if not os.path.exists(self.save_path):
+            os.makedirs(self.save_path)
+
+        self.language = language
+        self.download_urls = {
+            'en': 'https://dumps.wikimedia.your.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2',
+            'zh': 'https://dumps.wikimedia.your.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2'
+        }
+
+        self.output_files = {
+            'en': 'wikicorpus_en.xml.bz2',
+            'zh': 'wikicorpus_zh.xml.bz2'
+        }
+
+
+    def download(self):
+        if self.language in self.download_urls:
+            url = self.download_urls[self.language]
+            filename = self.output_files[self.language]
+
+            print('Downloading:', url)
+            if os.path.isfile(self.save_path + '/' + filename):
+                print('** Download file already exists, skipping download')
+            else:
+                cmd = ['wget', url, '--output-document={}'.format(self.save_path + '/' + filename), '--no-check-certificate']
+                print('Running:', cmd)
+                status = subprocess.run(cmd)
+                if status.returncode != 0:
+                    raise RuntimeError('Wiki download not successful')
+
+            # Always unzipping since this is relatively fast and will overwrite
+            print('Unzipping:', self.output_files[self.language])
+            subprocess.run('bzip2 -dk ' + self.save_path + '/' + filename, shell=True, check=True)
+
+        else:
+            assert False, 'WikiDownloader not implemented for this language yet.'
+
diff --git a/modelzoo/LanguageModeling/BERT/data/WikicorpusTextFormatting.py b/modelzoo/LanguageModeling/BERT/data/WikicorpusTextFormatting.py
new file mode 100644
index 00000000..9d356b13
--- /dev/null
+++ b/modelzoo/LanguageModeling/BERT/data/WikicorpusTextFormatting.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import glob
+import os
+
+class WikicorpusTextFormatting:
+    def __init__(self, wiki_path, output_filename, recursive = False):
+        self.wiki_path = wiki_path
+        self.recursive = recursive
+        self.output_filename = output_filename
+
+
+    # This puts one article per line
+    def merge(self):
+        with open(self.output_filename, mode='w', newline='\n') as ofile:
+            for dirname in glob.glob(self.wiki_path + '/*/', recursive=False):
+                for filename in glob.glob(dirname + 'wiki_*', recursive=self.recursive):
+                    print(filename)
+                    article_lines = []
+                    article_open = False
+
+                    with open(filename, mode='r', newline='\n') as file:
+                        for line in file:
+                            if '<doc id=' in line:
+                                article_open = True
+                            elif '</doc>' in line:
+                                article_open = False
+                                for oline in article_lines[1:]:
+                                    if oline != '\n':
+                                        ofile.write(oline.rstrip() + " ")
+                                ofile.write("\n\n")
+                                article_lines = []
+                            else:
+                                if article_open:
+                                    article_lines.append(line)
\ No newline at end of file
diff --git a/modelzoo/LanguageModeling/BERT/data/__init__.py b/modelzoo/LanguageModeling/BERT/data/__init__.py
new file mode 100644
index 00000000..d49f0d05
--- /dev/null
+++ b/modelzoo/LanguageModeling/BERT/data/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
\ No newline at end of file
diff --git a/modelzoo/LanguageModeling/BERT/data/bertPrep.py b/modelzoo/LanguageModeling/BERT/data/bertPrep.py
new file mode 100644
index 00000000..656d909e
--- /dev/null
+++ b/modelzoo/LanguageModeling/BERT/data/bertPrep.py
@@ -0,0 +1,388 @@
+# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
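+
+# Pipeline overview: the --action stages below are intended to be run in
+# sequence (download, text_formatting, sharding, create_tfrecord_files), with
+# each stage reading from and writing to the directories listed in
+# directory_structure under $BERT_PREP_WORKING_DIR.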
+ +import BookscorpusTextFormatting +import Downloader +import TextSharding +import WikicorpusTextFormatting +import PubMedTextFormatting + +import argparse +import itertools +import multiprocessing +import os +import pprint +import subprocess + + +def main(args): + working_dir = os.environ['BERT_PREP_WORKING_DIR'] + + print('Working Directory:', working_dir) + print('Action:', args.action) + print('Dataset Name:', args.dataset) + + if args.input_files: + args.input_files = args.input_files.split(',') + + hdf5_tfrecord_folder_prefix = "/lower_case_" + str(args.do_lower_case) + "_seq_len_" + str(args.max_seq_length) \ + + "_max_pred_" + str(args.max_predictions_per_seq) + "_masked_lm_prob_" + str(args.masked_lm_prob) \ + + "_random_seed_" + str(args.random_seed) + "_dupe_factor_" + str(args.dupe_factor) \ + + "_shard_" + str(args.n_training_shards) + "_test_split_" + str(int(args.fraction_test_set * 100)) + directory_structure = { + 'download' : working_dir + '/download', # Downloaded and decompressed + 'extracted' : working_dir +'/extracted', # Extracted from whatever the initial format is (e.g., wikiextractor) + 'formatted' : working_dir + '/formatted_one_article_per_line', # This is the level where all sources should look the same + 'sharded' : working_dir + '/sharded', + 'tfrecord' : working_dir + '/tfrecord' + hdf5_tfrecord_folder_prefix, + 'hdf5': working_dir + '/hdf5'+ hdf5_tfrecord_folder_prefix, + } + + print('\nDirectory Structure:') + pp = pprint.PrettyPrinter(indent=2) + pp.pprint(directory_structure) + print('') + + if args.action == 'download': + if not os.path.exists(directory_structure['download']): + os.makedirs(directory_structure['download']) + + downloader = Downloader.Downloader(args.dataset, directory_structure['download']) + downloader.download() + + elif args.action == 'text_formatting': + assert args.dataset != 'google_pretrained_weights' and args.dataset != 'nvidia_pretrained_weights' \ + and args.dataset != 'squad' and args.dataset != 'mrpc' and args.dataset != 'cola' and \ + args.dataset != 'mnli' and args.dataset != 'sst-2', 'Cannot perform text_formatting on pretrained weights' + + if not os.path.exists(directory_structure['extracted']): + os.makedirs(directory_structure['extracted']) + + if not os.path.exists(directory_structure['formatted']): + os.makedirs(directory_structure['formatted']) + + if args.dataset == 'bookscorpus': + books_path = directory_structure['download'] + '/bookscorpus' + #books_path = directory_structure['download'] + output_filename = directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt' + books_formatter = BookscorpusTextFormatting.BookscorpusTextFormatting(books_path, output_filename, recursive=True) + books_formatter.merge() + + elif args.dataset == 'wikicorpus_en': + if args.skip_wikiextractor == 0: + path_to_wikiextractor_in_container = 'python -m wikiextractor.WikiExtractor' + wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_en.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset + print('WikiExtractor Command:', wikiextractor_command) + wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True) + + wiki_path = directory_structure['extracted'] + '/wikicorpus_en' + output_filename = directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt' + wiki_formatter = WikicorpusTextFormatting.WikicorpusTextFormatting(wiki_path, 
output_filename, recursive=True) + wiki_formatter.merge() + + elif args.dataset == 'wikicorpus_zh': + assert False, 'wikicorpus_zh not fully supported at this time. The simplified/tradition Chinese data needs to be translated and properly segmented still, and should work once this step is added.' + if args.skip_wikiextractor == 0: + path_to_wikiextractor_in_container = '/workspace/wikiextractor/WikiExtractor.py' + wikiextractor_command = path_to_wikiextractor_in_container + ' ' + directory_structure['download'] + '/' + args.dataset + '/wikicorpus_zh.xml ' + '-b 100M --processes ' + str(args.n_processes) + ' -o ' + directory_structure['extracted'] + '/' + args.dataset + print('WikiExtractor Command:', wikiextractor_command) + wikiextractor_process = subprocess.run(wikiextractor_command, shell=True, check=True) + + wiki_path = directory_structure['extracted'] + '/wikicorpus_zh' + output_filename = directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt' + wiki_formatter = WikicorpusTextFormatting.WikicorpusTextFormatting(wiki_path, output_filename, recursive=True) + wiki_formatter.merge() + + elif args.dataset == 'pubmed_baseline': + pubmed_path = directory_structure['download'] + '/pubmed' + '/baseline' + output_filename = directory_structure['formatted'] + '/pubmed_baseline_one_article_per_line.txt' + pubmed_formatter = PubMedTextFormatting.PubMedTextFormatting(pubmed_path, output_filename, recursive=True) + pubmed_formatter.merge() + + elif args.action == 'sharding': + # Note: books+wiki requires user to provide list of input_files (comma-separated with no spaces) + if args.dataset == 'bookscorpus' or 'wikicorpus' in args.dataset or 'books_wiki' in args.dataset or 'pubmed' in args.dataset: + if args.input_files is None: + if args.dataset == 'bookscorpus': + args.input_files = [directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt'] + elif args.dataset == 'wikicorpus_en': + args.input_files = [directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt'] + elif args.dataset == 'wikicorpus_zh': + args.input_files = [directory_structure['formatted'] + '/wikicorpus_zh_one_article_per_line.txt'] + elif args.dataset == 'books_wiki_en_corpus': + args.input_files = [directory_structure['formatted'] + '/bookscorpus_one_book_per_line.txt', directory_structure['formatted'] + '/wikicorpus_en_one_article_per_line.txt'] + elif args.dataset == 'pubmed_baseline': + args.input_files = [directory_structure['formatted'] + '/pubmed_baseline_one_article_per_line.txt'] + + output_file_prefix = directory_structure['sharded'] + '/' + args.dataset + '/' + args.dataset + + if not os.path.exists(directory_structure['sharded']): + os.makedirs(directory_structure['sharded']) + + if not os.path.exists(directory_structure['sharded'] + '/' + args.dataset): + os.makedirs(directory_structure['sharded'] + '/' + args.dataset) + + if not os.path.exists(directory_structure['sharded'] + '/' + args.dataset + '/training'): + os.makedirs(directory_structure['sharded'] + '/' + args.dataset + '/training') + + if not os.path.exists(directory_structure['sharded'] + '/' + args.dataset + '/test'): + os.makedirs(directory_structure['sharded'] + '/' + args.dataset + '/test') + + # Segmentation is here because all datasets look the same in one article/book/whatever per line format, and + # it seemed unnecessarily complicated to add an additional preprocessing step to call just for this. 
+ # Different languages (e.g., Chinese simplified/traditional) may require translation and + # other packages to be called from here -- just add a conditional branch for those extra steps + segmenter = TextSharding.NLTKSegmenter() + sharding = TextSharding.Sharding(args.input_files, output_file_prefix, args.n_training_shards, args.n_test_shards, args.fraction_test_set) + + sharding.load_articles() + sharding.segment_articles_into_sentences(segmenter) + sharding.distribute_articles_over_shards() + sharding.write_shards_to_disk() + + else: + assert False, 'Unsupported dataset for sharding' + + elif args.action == 'create_tfrecord_files': + if not os.path.exists(directory_structure['tfrecord'] + "/" + args.dataset): + os.makedirs(directory_structure['tfrecord'] + "/" + args.dataset) + + if not os.path.exists(directory_structure['tfrecord'] + "/" + args.dataset + '/training'): + os.makedirs(directory_structure['tfrecord'] + "/" + args.dataset + '/training') + + if not os.path.exists(directory_structure['tfrecord'] + "/" + args.dataset + '/test'): + os.makedirs(directory_structure['tfrecord'] + "/" + args.dataset + '/test') + + last_process = None + + def create_record_worker(filename_prefix, shard_id, output_format='tfrecord', split='training'): + bert_preprocessing_command = 'python /workspace/bert_tf2/create_pretraining_data.py' + bert_preprocessing_command += ' --input_file=' + directory_structure['sharded'] + '/' + args.dataset + '/' + split + '/' + filename_prefix + '_' + str(shard_id) + '.txt' + bert_preprocessing_command += ' --output_file=' + directory_structure['tfrecord'] + '/' + args.dataset + '/' + split + '/' + filename_prefix + '_' + str(shard_id) + '.' + output_format + bert_preprocessing_command += ' --vocab_file=' + args.vocab_file + bert_preprocessing_command += ' --do_lower_case' if args.do_lower_case else '' + bert_preprocessing_command += ' --max_seq_length=' + str(args.max_seq_length) + bert_preprocessing_command += ' --max_predictions_per_seq=' + str(args.max_predictions_per_seq) + bert_preprocessing_command += ' --masked_lm_prob=' + str(args.masked_lm_prob) + bert_preprocessing_command += ' --random_seed=' + str(args.random_seed) + bert_preprocessing_command += ' --dupe_factor=' + str(args.dupe_factor) + bert_preprocessing_process = subprocess.Popen(bert_preprocessing_command, shell=True) + + last_process = bert_preprocessing_process + + # This could be better optimized (fine if all take equal time) + if shard_id % args.n_processes == 0 and shard_id > 0: + bert_preprocessing_process.wait() + + return last_process + + output_file_prefix = args.dataset + + for i in range(args.n_training_shards): + last_process = create_record_worker(output_file_prefix + '_training', i, 'tfrecord', 'training') + + last_process.wait() + + for i in range(args.n_test_shards): + last_process = create_record_worker(output_file_prefix + '_test', i, 'tfrecord', 'test') + + last_process.wait() + + + elif args.action == 'create_hdf5_files': + assert False, 'HDF5 format not fully supported in this release.' 
+ + if not os.path.exists(directory_structure['hdf5'] + "/" + args.dataset): + os.makedirs(directory_structure['hdf5'] + "/" + args.dataset) + + last_process = None + + def create_record_worker(filename_prefix, shard_id, output_format='hdf5'): + bert_preprocessing_command = 'python /workspace/bert_tf2/create_pretraining_data.py' + bert_preprocessing_command += ' --input_file=' + directory_structure['sharded'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.txt' + bert_preprocessing_command += ' --output_file=' + directory_structure['hdf5'] + '/' + args.dataset + '/' + filename_prefix + '_' + str(shard_id) + '.' + output_format + bert_preprocessing_command += ' --vocab_file=' + args.vocab_file + bert_preprocessing_command += ' --do_lower_case' if args.do_lower_case else '' + bert_preprocessing_command += ' --max_seq_length=' + args.max_seq_length + bert_preprocessing_command += ' --max_predictions_per_seq=' + args.max_predictions_per_seq + bert_preprocessing_command += ' --masked_lm_prob=' + args.masked_lm_prob + bert_preprocessing_command += ' --random_seed=' + args.random_seed + bert_preprocessing_command += ' --dupe_factor=' + args.dupe_factor + bert_preprocessing_process = subprocess.Popen(bert_preprocessing_command, shell=True) + + last_process = bert_preprocessing_process + + # This could be better optimized (fine if all take equal time) + if shard_id % args.n_processes == 0 and shard_id > 0: + bert_preprocessing_process.wait() + + for i in range(args.n_training_shards): + create_record_worker(args.output_file_prefix + '_training', i) + + last_process.wait() + + for i in range(args.n_test_shards): + create_record_worker(args.output_file_prefix + '_test', i) + + last_process.wait() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='Preprocessing Application for Everything BERT-related' + ) + + parser.add_argument( + '--action', + type=str, + help='Specify the action you want the app to take. 
e.g., generate vocab, segment, create tfrecords', + choices={ + 'download', # Download and verify mdf5/sha sums + 'text_formatting', # Convert into a file that contains one article/book per line + 'sharding', # Convert previous formatted text into shards containing one sentence per line + 'create_tfrecord_files', # Turn each shard into a TFrecord with masking and next sentence prediction info + 'create_hdf5_files' # Turn each shard into a HDF5 file with masking and next sentence prediction info + } + ) + + parser.add_argument( + '--dataset', + type=str, + help='Specify the dataset to perform --action on', + choices={ + 'bookscorpus', + 'wikicorpus_en', + 'wikicorpus_zh', + 'books_wiki_en_corpus', + 'pubmed_baseline', + 'pubmed_daily_update', + 'pubmed_fulltext', + 'pubmed_open_access', + 'google_pretrained_weights', + 'nvidia_pretrained_weights', + 'squad', + 'mrpc', + 'sst-2', + 'mnli', + 'cola', + 'all' + } + ) + + parser.add_argument( + '--input_files', + type=str, + help='Specify the input files in a comma-separated list (no spaces)' + ) + + parser.add_argument( + '--n_training_shards', + type=int, + help='Specify the number of training shards to generate', + default=1472 + ) + + parser.add_argument( + '--n_test_shards', + type=int, + help='Specify the number of test shards to generate', + default=1472 + ) + + parser.add_argument( + '--fraction_test_set', + type=float, + help='Specify the fraction (0..1) of the data to withhold for the test data split (based on number of sequences)', + default=0.1 + ) + + parser.add_argument( + '--segmentation_method', + type=str, + help='Specify your choice of sentence segmentation', + choices={ + 'nltk' + }, + default='nltk' + ) + + parser.add_argument( + '--n_processes', + type=int, + help='Specify the max number of processes to allow at one time', + default=4 + ) + + parser.add_argument( + '--random_seed', + type=int, + help='Specify the base seed to use for any random number generation', + default=12345 + ) + + parser.add_argument( + '--dupe_factor', + type=int, + help='Specify the duplication factor', + default=5 + ) + + parser.add_argument( + '--masked_lm_prob', + type=float, + help='Specify the probability for masked lm', + default=0.15 + ) + + parser.add_argument( + '--max_seq_length', + type=int, + help='Specify the maximum sequence length', + default=512 + ) + + parser.add_argument( + '--max_predictions_per_seq', + type=int, + help='Specify the maximum number of masked words per sequence', + default=20 + ) + + parser.add_argument( + '--do_lower_case', + type=int, + help='Specify whether it is cased (0) or uncased (1) (any number greater than 0 will be treated as uncased)', + default=1 + ) + + parser.add_argument( + '--vocab_file', + type=str, + help='Specify absolute path to vocab file to use)' + ) + + parser.add_argument( + '--skip_wikiextractor', + type=int, + help='Specify whether to skip wikiextractor step 0=False, 1=True', + default=0 + ) + + parser.add_argument( + '--interactive_json_config_generator', + type=str, + help='Specify the action you want the app to take. e.g., generate vocab, segment, create tfrecords' + ) + + args = parser.parse_args() + main(args) diff --git a/modelzoo/LanguageModeling/BERT/data/create_biobert_datasets_from_start.sh b/modelzoo/LanguageModeling/BERT/data/create_biobert_datasets_from_start.sh new file mode 100644 index 00000000..3f1a4163 --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/data/create_biobert_datasets_from_start.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +# Copyright (c) 2019 NVIDIA CORPORATION. 
All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +export BERT_PREP_WORKING_DIR="${BERT_PREP_WORKING_DIR}" + +# Download +python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action download --dataset pubmed_baseline + +python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action download --dataset google_pretrained_weights # Includes vocab + +# Properly format the text files +python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action text_formatting --dataset pubmed_baseline + + +# Shard the text files +python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action sharding --dataset pubmed_baseline + +### BERT BASE + +## UNCASED + +# Create TFRecord files Phase 1 +python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action create_tfrecord_files --dataset pubmed_baseline --max_seq_length 128 \ + --max_predictions_per_seq 20 --vocab_file ${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights/uncased_L-12_H-768_A-12/vocab.txt + + +# Create TFRecord files Phase 2 +python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action create_tfrecord_files --dataset pubmed_baseline --max_seq_length 512 \ + --max_predictions_per_seq 80 --vocab_file ${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights/uncased_L-12_H-768_A-12/vocab.txt + + +## CASED + +# Create TFRecord files Phase 1 +python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action create_tfrecord_files --dataset pubmed_baseline --max_seq_length 128 \ + --max_predictions_per_seq 20 --vocab_file ${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights/cased_L-12_H-768_A-12/vocab.txt \ + --do_lower_case=0 + + +# Create TFRecord files Phase 2 +python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action create_tfrecord_files --dataset pubmed_baseline --max_seq_length 512 \ + --max_predictions_per_seq 80 --vocab_file ${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights/cased_L-12_H-768_A-12/vocab.txt \ + --do_lower_case=0 diff --git a/modelzoo/LanguageModeling/BERT/data/create_datasets_from_start.sh b/modelzoo/LanguageModeling/BERT/data/create_datasets_from_start.sh new file mode 100644 index 00000000..f09b0d0f --- /dev/null +++ b/modelzoo/LanguageModeling/BERT/data/create_datasets_from_start.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
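+
+# Usage (as implied by the argument handling below):
+#   ./create_datasets_from_start.sh [to_download] [pretrained_to_download]
+#   to_download:             all (default) | squad | pretrained
+#   pretrained_to_download:  wiki_only (default) | wiki_books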
+
+export BERT_PREP_WORKING_DIR=/workspace/bert_tf2/data
+
+to_download=${1:-"all"}
+pretrained_to_download=${2:-"wiki_only"} # By default, we don't download BooksCorpus dataset due to recent issues with the host server
+
+if [ "$to_download" = "all" ] || [ "$to_download" = "squad" ] ; then
+    #SQUAD
+    python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action download --dataset google_pretrained_weights  # Includes vocab
+
+    python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action download --dataset squad
+
+    export BERT_DIR=${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights/uncased_L-24_H-1024_A-16
+    export SQUAD_DIR=${BERT_PREP_WORKING_DIR}/download/squad
+    python create_finetuning_data.py \
+      --squad_data_file=${SQUAD_DIR}/v1.1/train-v1.1.json \
+      --vocab_file=${BERT_DIR}/vocab.txt \
+      --train_data_output_path=${SQUAD_DIR}/v1.1/squad_v1.1_train.tf_record \
+      --meta_data_file_path=${SQUAD_DIR}/v1.1/squad_v1.1_meta_data \
+      --fine_tuning_task_type=squad --max_seq_length=384
+
+    python create_finetuning_data.py \
+      --squad_data_file=${SQUAD_DIR}/v2.0/train-v2.0.json \
+      --vocab_file=${BERT_DIR}/vocab.txt \
+      --train_data_output_path=${SQUAD_DIR}/v2.0/squad_v2.0_train.tf_record \
+      --meta_data_file_path=${SQUAD_DIR}/v2.0/squad_v2.0_meta_data \
+      --fine_tuning_task_type=squad --max_seq_length=384 --version_2_with_negative=True
+fi
+
+if [ "$to_download" = "all" ] || [ "$to_download" = "pretrained" ] ; then
+    #Pretrained
+    if [ "$pretrained_to_download" = "wiki_books" ] ; then
+        python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action download --dataset bookscorpus
+    fi
+
+    python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action download --dataset wikicorpus_en
+
+    DATASET="wikicorpus_en"
+    # Properly format the text files
+    if [ "$pretrained_to_download" = "wiki_books" ] ; then
+        python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action text_formatting --dataset bookscorpus
+        DATASET="books_wiki_en_corpus"
+    fi
+    python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action text_formatting --dataset wikicorpus_en
+
+    # Shard the text files
+    python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action sharding --dataset $DATASET
+
+    # Create TFRecord files Phase 1
+    python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action create_tfrecord_files --dataset ${DATASET} --max_seq_length 128 \
+      --max_predictions_per_seq 20 --vocab_file ${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt
+
+
+    # Create TFRecord files Phase 2
+    python3 ${BERT_PREP_WORKING_DIR}/bertPrep.py --action create_tfrecord_files --dataset ${DATASET} --max_seq_length 512 \
+      --max_predictions_per_seq 80 --vocab_file ${BERT_PREP_WORKING_DIR}/download/google_pretrained_weights/uncased_L-24_H-1024_A-16/vocab.txt
+fi
\ No newline at end of file
diff --git a/modelzoo/LanguageModeling/BERT/data/images/bert_pipeline.png b/modelzoo/LanguageModeling/BERT/data/images/bert_pipeline.png
new file mode 100644
index 0000000000000000000000000000000000000000..40193e9e13ba2daa25f70d1ee69a34f1958ca7ce
GIT binary patch
literal 212516
[base85-encoded binary payload for bert_pipeline.png omitted]
z7-noMu|37dc{VjKtIwpl)*>9~FA3#b4Iq7ULR-sxgWI7_Z*zr+UQ5>|EsIrk%32P; z>xX94X;nLwDKcrYcqIN~ct4Tg3fSPqYxrM>lU1pJ2s*iW&j6S#SDEpDu(3ml?`<}v z>tW!tH=QXB^G7pTECW|1!Qp0uu#mQVItclNUY^l;#p~dhz$*BJY&vZ#*!6~DG|YPj z;4&EPxOg1ym&Jmf`0#b2@Wamv%^J@n%xZ@VR?fNy+cw&zawk|J+c`Mn;bLuDhR41x z@1jt!8q$d`h`*N$G*q%D8_)2Z&CS>y> z=W3Q+m^9t5?KFtqC`KVAKXF4ZatLoYLlqvJDon#VT!XeQFGtE@x+sHR)Dc|4_DyH* z%l3Y#5dl60t8eE{qw+kq-IVhlyJCeAc+a1aQvRu&kK)f99(F2{_u7KxnK+pjeSFov z`l46*xeS%|oX|8%%k*J$umcoS()mG1PUcOj&?PjmnytfDpybGPk7t#DQeBXH$zii7 zfH28hzKbVoel4-j&rJBhayg7rmFZB&Ak}m_Hl{4hdgpm#nPI5|&8p{aF%L^*2FSk# zKC!=~{~>kSTMm?AD;3K84xh39!MI@5Td0M@J$pP?lSF%;ZdGWMhYUY0FFIC3?DJ%7 z;G8Fx=51Yx&gZvmDpZ5Ag<(C})#{ASL)NQh)`G$ZiOQ6WZGJ;RXR0mqIQHOHX%ip% z1JeqK$IhDe>S1r;oc2LSRr3bZvG2Cpn^~=o1Iqh~wtKMK>+~r71(!n|iW*2U-^^d= z8H#kT>0Q;ISI_pPe!NMS1GT2?3}FIo#Zp}}U&75W&woFxD~|rm#F-8#Vui_g)a*TY zX2<)YvR&p8mf5GLC(=fCs-!(o_L|bRaa!Lq{l;`4dM*vz3l%o(1kLYwyJidA!iapE z)m-V7VO~spWrLzAUQAjxKOKuz$C5gsD(KmG8_R}L8=XM?cQ4JTMIx#y(9~Ob`ne_z z9fZM55tUx-7wQIo%)0e@NZzv!ufga@S-$iwW{>Nsb_c(zItDJ^RD%HR-p3rKQnJ1v zI3!nvC$8eanOvrJT-xg1v0UO{EjeXN*G>>blSU6#NdnBf|)_nx7W zf+L;)ZbVG;`+tIcu-W%7fj*Db2E}&3(nWcIPs?-An<|Ai!M$oSwDEl9cjQYxHW*f8 z!lC1&=S4VxwcA}+xl@zE?NhL=lkqKEaA@u`z+~dAW-R`%A?U`zq|0`nATAr^p9`NL zzfMV9D-KNQ4#N$8=lV{@kj6kEf}>4aFYimerUqE&qQs@$9N_)O+YKqL6J#*h_Tme~ zvJ!NFEp2XLlK)(@M$^?yNyqEW4(tE2@TjM#%n?LO>u2{k9fuPNl5p~ccZLY@I!mR= zTZw9I(P;1GKV+7%ep@Jq)vgDR)d@_%)tDI!%?>Ur22jj7jU8N;Rb%&jkGf-8of z44Vd$9>o877Wv5pM(Fykvq8rdkcy#PZWS*A@(P?x^>Dsr$>zR$lMs2{$b+lz=?q0; zx))!S6#XfHl|s<5C>guz9$sB%j;Tzw0qLf02PkoAsbwP)9p>d5cYWJ!4~GZ6rrh)g zJgetB3M>dVyL^A{5X{i`AAM;X0j#gw8f56u$jQ*Kl;59UU@>VqR$CCEeI~PQJcM?- z#zkWP^D*g^%1am3OwsSzZ{y*h*5Tr4w*f0DD;eu;F%wp%)i-$)cR34();Q{=A8{_A zU>Q1!$1+ODpf={Em%7>uKL(wIcDo>e%63P1kGFUfX!`G_V5}9>)EF=0vAa~wLj#%h z;&9o67%}|qryuD!Hz_w&ar~KEq?Py}#cMe(V{m>#FP7+JjUxh(UVW+52IP&3Z5ig6qj8U2-((`C#>X^Q&?XFxk9G6g zekl=^k?)YsK7aq3u&Y5sszT(QV5qffLncntblhW&b2#+s$!p3*D@o125Elo@RV2Tg zqjjm>kq;^7zQ}>1=ZXZ~Hfh@c*tQ4;Va5IOy?XZlQQMNjv^%t`KHN;+m6gu1cj0EE zeTwMeNkXx}c=qi_S0i>fZs#1qA5z!me@g7%75;@>RhT}~4t;C2m5GwO2H{g(#deDB z#`ETh&5kNozV<_%W30wkg}9KoWy2>no%1N!R@|>y!jCtNGDu0?ZPMR8`O;j7?-_B_ zt!mnNimqgm1GLlnyn;0>LiFFdFDpNbtroDuIl2bpA}^Jcl4OTpGBH^FUlyRaHa6*k z7`Zcy_nGmIcDT5~Fp2-bxYR?sSc3wSPGy(ec50fa5@5$hUCogzXNJ+YgJoyucPBgN zBN2y`l`z8KHePXpnUsKA#_kl=Z|@U@Pq!7`-t>@EiuRg_#X6uCr|fXgPM~|CW-vfm$e7AED#>8D#Bu`{6HH4rJcai zNyi<9ZKym;I8D!SbdY}F?V8q0k+jkow*~TLZwXqqGc7$#89{i?!lR~UYUVj%?vlsu z5K&>@2l4&NwQ-rLKi$^x0+sFHl(0lgg{ixt*LAOX{Cpx?a>Gw2=tU(;T@H$sbk&qt z!e?j8I#h3U)c`Y5F4=pC0r%^^+?7?K~dNwuY$-mVTi*g7zI0}xez+T|a>b1Wf@ zHURYSJ3LzhuwiFC=RP2l_QTM87;u9$V@h^<4YNdZ0t7l*g1oOo)id9{1>}XKj}DF^ z1#HPwoMv!J;j?Gow=3c`Bp>J=wK0F(%lY9jT+#s$x&k|{T9{o?Dx^PdAqHXTI3_2! zt(HiOCU9$gYyll5m`k}f8STq%Ir)0E{?>KiOq6>$KSyW#X4Mxb%F`kfKo>?`2iebo zXocQ>`b2vIxiwW_(`@AGD$Y)&{c8Zt)F+XmceP1UA;I{wwYW#H{2rU7{pV3n%mSI$ z^jx;b%H?NePn&pvm0{IcrkVr`MRofMwA(Q>I!UI-#tJ+f8|NV$bKxgxnl{M1KIFlh zb4vV|0IM#;5Oe%_C54$vWqf|*7U~+20L|<=Oh_Q41mOpKgeE`wTKy||Tjny1)tP8* zw!GJ8mT)%ps<-@1P6F*i!a_%8=!N`POj+Z;y@2svS^(P{BVG#~%*HL+m(Gl6Od7Gg6f~D2WxmReO@ncVxJ%Z5nT(b7_puzf(w@Ki&+LBasU7$iz6xOMuH2 zQ9Y#&7}C=59S+2NitWA71Lzu=>n<~F*>GWFYtdiFfaj0FEd3V;7GDK?2Py2jSqS2377AFbj0{ z5NN(K?;1%g+;Va4Y8qyS-j-}OAQ&;{0TNVd&({C)R4jZu2oXdWT z0VsE9BTmEe@;u>*ypEwkO%HOdHY`!A0d_A)0)`@43?DZ2+ApIm?gvaRuJ4z!i-{@K z3sQF5+--xBOpXTxF)2o5*F6MHq`NzJzZlB?V-Fe2W2GEtGR|FcxC!*}9@;TCE#(<} zU}SVVsNCB)_-?}L*Hh0p#F7Xs`HRQs%cd-a*3So&K)><)AAGZljA{;`){ujsGcM72 z^TCrR!J7q5>R0udpU5zBv#&03!c`x?8aCcXJ{h! 
z3mP3z-_v`$cm~wuQU-jJ7@%Gf*3991x*r#57hBNMX#=?|mD`dCxU+S9?D2{ZE8fbC zPQc&8%j{p!6`IWTON{FHo^(~NM3OvvLc)UlSlqIDviXcl^~^gfOdtZb3v9Z&X+L)M zG=)Fvdpz%V7HLoyY#Ztz2m`AdTYgh3_Wii7a4cI`+(o^J;*z(k@%S}Jt5(^5B&vWO z$Mvw~n1wBb&VP(8oi8=D2UeG$8#pV3U7{XMLnUSBwybA^tiOb;%NtZh zh{DB}(H@#Mixq>iG*!>jV+4MybIIRyRjdo`>?#ZP_1=e1R9;UBmJy*nwPR*7kv`8E z7E|w{R!z=fWnfMDq2~Wj7aP(Q;$&i_JUaF+*T6$455?Q1-m|uRJGuGyX=gUZbwfR9 zTN$+uM_>E~h*UO=nf6IDVtf*ur^g5~dm?$fV4ic^WmD9h+kDzAZFY0leyPNW*>nam zNLssDh`RPUjXGS+oi;*Bp64CS`T9&O&k^)eVK<1*?AT$0IA)_Y0^EL^4U){zQ1{uy zH*=v8|zPKqP(2`AEqxg zOIv3)>B0S2dtOGN9wsanMdG%Q@LjAm+V!7*UooeMpR%RwtCw-snI!iBYS$o@!d&6I zVa=9~8^hgZ?>hn4W+u`sm^LZESd$5C*~1j#-kz8bYO6h^e|g%RtO9x+t*_{+_xP_} z%eAzn9Z)NgT#CMK!wK)A{$hoEMjAScSH9_(zAx_d-LIe}L*Q4(N4%U$&+WQ3m?d>pr-dwr z%&s%6^Dbr|@P7@TcUKH2t`#N_#ze-HcajQP*Cnct*e6^AyciR1DC)BW9D1DaTyCQ} zG+Vea%?X&;33ZNhSL+Wd%z2Oes7&Z)IH0-QBNK1`HR>8W3aJm_TWfS5 zjdz^ZP(Q*F*g>-&2oqDAgo+iecR{z_I6svwZxTn)9H&mD>cW8?(t_8psR`(j>6#mu zB=q=E%l$GhbFEckUj;3iL~dO-V5?hiiTXE89+MMtUnLB5vV#m{F6-ViCV(k3M_fcE z2{n=LW$*8Br?XsqY0W3oE5ro-GveryJ2)K#L*0uJ95F)BEReQ7bx1@uq5@%ILKB=J zcuP%}P*VybP`0&u6ISDlIx2lJtiX+}Z5jX_)Ss@DN4GYcDZWec)8t9@-IWn$EQ^6e z)+s#jGbmZJ5?VS2)*oNC^vY8Mr8@U#KCU;gkOOAp!x|86lgedFNY{$TDCD4F`kMSe za^8<_`%-gnA#*3szidsiHHj7dc&V4EZymX`X;l>MV3R=KZ5>4|e zPuJnbRU2x2$V|yC_Q27K|6!^#&(-uBk5>k_+gkJ+)-LTeg{}>Y7%8K`b^n?705TS ztkt-BtNMDlVQ1^7DL*Y+%k@1yL0msm=krj0=F~Zsi+goDnx>elXA<~Nf&}I}7h)|; z1{jlLvibS5S)AUUDI-+o8=9I^?dNYo%efcdqjUx60leL+EA!umn~zZo#(ZT*2WI>nBsO1qaN*bM~4LW zFGDG0N#7}Z2`tLx*G5}zxq^=ih;yu$d!tkdZ~{laMZ*j(Mf(D;zDX(PfuAUq{}gk8 zWs%aLs&?)6U*#Z1GIlPukEd}ls-`A=4b}l^z2uFel0igIXke_dnff>PY1jn?g91<1 z*i2wP!>L}-B4W>X*fhYX@@z7|^+pBoqAj8uI|R`ER6Teipe0~iaNCVBOJE<^*wAun z4-90!j9sQ>m%bb$w+J6gAI*5FP3!jY7{Te|nyZUBV+M|fxvSV_`hH9Ob#*FwB3O>q zs|nMc{;D~1(R(6G5)a-juEUJ{QlCD(RLxIepQxpPN@r{3JO1F5@AhXss<*YXjdLG& zNeS=rAC9a&2ts^&y}`1M)+g9=)L z)a*_>I!`3B+ppJRy~Q%;!E!tHo~S@0a}C65$e7rDK}aps>0U5Ky(D>sq>tNgg!RE!-h5 zU>9$|Pv)#>YsfdNY=iF!y^sz-Ntz-j$Ann~7#si6#zR(4i0<`mlJl0qa%?4}lU_0H zp60h}A3;d16UYdf2$6W!{K=uxps|0xet(+^{X~cPqiJe*rNy(UzOU>YOIA zWYJo~_Z*n~rejmMq4a47@0}|@-B)|E7=%viwvR)0$HymX#`&<01!b`2nV!nmZ>N!4 z)!1C~TYR>g0fpHHP6A48Nssf&fyo;!F2kndhC-?e7`gEU(j)Y5*0uP1tXO*zjBYl| z-&Bql=p?lE*fP^dJu*~FWit`&#tfe)f7KP9Wy{yA5nb(iXo{G8)C>icaz+*yNlpcD z-wlApVefoXbB~T0`k$v04mRvak@cfK+3XdY<6&cj!6Oqhy%LHk4h~Li`*zgYPR;mxzm0&kV46=J3$OO}~0K_Uvb2Z9vwP*nc_ok`rTPO=zttq3$da@J;M6t}I zqIDuB$gk)+?usC7LzGK)Y?nTZJP8%K!yf5I1y^F5;$q{Q2eY;wy)>9XMvQ+^J`&S# ziTv2{xv(&quwi*UWtXDH`894`jard=Cy_RSh8gP`6@nS_Vw8H*G_f@L5lPdkz?ti} zBVthTa*972L{oev3F?;Zg;Rf0ri&K5#4yW8aLCCZv08) zlvY-+%I`1TnmgT%Rd2&%-U990UOjGl!V42Rb8wb$v!t$W+Q5%B7N?>aqqf&+rs1^8 zrQ-Dlnnk=6i+Q`@0bi9hTGuQWYQ(Q^qA6!7N>WN<(^Jag&-&6YE1FNi#TsDTC6QMc z=2`}3u>L`D=!9Ao80AjJ0DVP_4aJWhB$xm6YpP_{*fH7+HlS+atB|2J@=kjH(ts8> z`nb;3`Z^B!hN~;!;I1BFW;Ly~gF)s(52pI|(~ZB&fQI&!lDv3S&s5D@#zx?5QD~02 zAwMW~Jj&!7TtcMiY{}7>SpFz6C{gAya%=JvrFR|uJM6szzzjDqwnNd^SNUehC+GE8 zqK{MW2<|ndiu~&oq%sD2Cbs9p(aUYo0P?QAvzC#mAH;~+K$C4`Ps`dhcyY=?!k#y` zA(3~a%GbQ&%TH(DaXnmA=vY+tH(3S&N1?=_Y9&(i7IPfp7g_!o%!vTQ!C$vDVzu1v zOMz11ds5F^@=^d@NheRu7pQ^vW4G@QX^AEQcwYZx2w~5W|7g}=0B{*+Ra}Xt8DffB zX%No`FKR$@?%k*_2&BX#dxG$p+c%3FP2ko3Bf05u+cV=nuxnYcBoEbS8cukQl(nDE zo{dN)P?R;IL2ea_;2p-?Z_&)BCB6+$76(u~f8hhYSCLXRW2TdLYNk9at8NQXv)?{{ zv)tn)qcv^Z_9`keX94Rj7xk*OnqOk1Olr#5fj1R~CF;;k4~{IcZ`S_|dEJc^Sm7cd z?K8ALch8oXR$~&~*4y=3wwU4WTMUse-2&%1SE!Aocd#mNdl|n5QVO;MqqP#h=0+m6 z`e^Gki?;{ja78XmaCpK`={nf0^ojcZ>4ji`1xM3L#F@hxFk|0WTRejU8<4t{<2rqt zDE3t8U5b03BD5_-JP4IO&=>K&(xotT z`HT@E+9HG{cFFcW#v!q>^z#cx5<5T|dvSU{*S1?;okzvxo}l#7YukfIV{4qR@@27>;@t{_fMY(cDjLP(5WS%No0~3Y 
zfvYJimsNaW_(v7Qcu4>9eW!;QT4|F>Vg8P@b~pkRAZ;){C@Oj{TycaCF2=lLugk!D z+vfG=`?CQewnb}V032kI(QCRq0OBqEN(0yMA$drL5ig|^e!Ss~%Ckdbo;MwiFq#>8nWKc!Qw3c92r^@b+Aq76M;LfjA4u$ZT z24e06OGkU$oXLXgJXl=>S4yn70I_J4pX)eHt>vmRZL7;efxNtR(EJTJA)b&CVeb!E z*FXH#lG4;?Tfd0(rnnZ&SsB|MymRuof^ppPkh*#Do=MyE-3x z93L-_hg3M`PjSRJS;|H9B;`Wvx&fot#}o7K%igPB{F8IuDb$@$L9Av3N6g~@Isow` zU{H6sP3@t@uLZf%7fkQ#()5>{j?bUxTyxUG~L|i8eveuc{p=% zQQ*5bL?R_0yw-by5kDD@9XROO>6QzA5q0?%!4GNZXwmBmLdiC*Q)(sxdm-nA3VnVb zMq^}KME$fBHfPss4qf(eIHGQU?{Q1*7s&Tj?BugVFIt#Dl4j=D1KR)G-Ia{wqaaS; zp;Yz1p8M1RNPUIWW(szuzXRVg$BbeQUrQvLdC>`620pbvo&P+3(AoR{TlNb0+nuFk zG0t}nywoka9)*Ie0>dl&en~OK7-1TtPmne1Hz|^}F%V>L zWa*4cu9jqXQ(z_MZAB*2V#+wd=esryX;*;V8BL6vKwI%m-$pL?i)BY{ zf3kPX2{14{Q~=UT?X#p~El1GW+mOupd1z2!GRgRko%ct`YO(G>AkhFP1z{+^S`0BZ z6wj&jb<{=mCA?<1ncV(^CIj|ok-q!>Bb>&Pj;z;8P} z@6&qZIEq{amh1dD7-9!bbIRi|8g+}dzFo^R?o984B>s7}FZtaV&hGDg2d^d!3OH7f z^0^TWSkx7)YG(xMwXn~H7UJB?2EvA=308SPx#IAwDvf&CV;xtJynI(m!4c;U6Fr^P#};e>>QJ?qmu&Iwxt4;$y@x#0U*01ZCRH|2#BGj^fw z#7oTe6y3NP(2ymvqvw-?yxM&vdtn2l{N??2R{U7V z#>d{Gwh--aLO+F+`)}^1k<$aYe{K*vY^MK?7EHGB+8v)U?+PPuCQuuDY}1u^vp1*D znc}0FhtH$`8SUj+)08;8jJhw*>$E_Fv}ZUU%+^4Q$%ajND1j~-vuFD>m~_PMC5OSj zBe~z-4e$l+`Y2S&`{#5XF_G#Tq1+LwduQr<>c|HLzu4qM=3qnz)u;>E(k|OQ7I0S$ z(p6y*#G@{!y9ww?Q|`ap0nX(8Al5(qnpn|^d{=||;>0xSky=bR4Zh$`$=+ncR6V9r zRPQx==QCI}2O>682>c4&)ciV>$kmwtQi-c2H-_^o2O^DIi+qNiHSLQsy~m{Wy-r6N ztSvsvi*`eMS+QXiAH$BC$lB8&xZ>CS@2Wiy{K1yKMS9A=s7_NVt&g97M85ieCmx4j zih_fsa5#3TN*ot0Ukkz`HV6>6dZ+Tvvc{Pd9`(xiv#8op@G$PsFR&hQxFez)fM zS(RKvneQ}kcAa7MFN*rTs}6+}=Sbs9EY+yh`J`A=VMReH5G&e6WP&=}CC}?Ik_$wA zT#)93(eYc9|7*VQxbn38u@iy1FiG*S#8((*4lpS>x2t=aoYf~A4~_ER0Tiu~VHtDk zd3B}wJCZkxi^oKLC61ryw8Ip{cp=Sbb>{2Ib`zZBp+BdhoK_^`r!U;~5mmA6Fl!Gi zP6xoE0QB}yRC~QJpv1g12t9V>Qa8Vt6Buz@>C~RsvqT{pX}QatF< zfw}L#)-ZU9=o0$bbnbD8XrXLU6@gb{>*%U0P#rT}_NMkE!66cGr}T09F3K_QOATyE z$7QdD#4M49U{y$ql3e7>mPk^rUDX8T@zI82YidAzQP(O;Knjx+j9#@xW65?bOFSbvDzz5&8S-y22*Sl>1BsjVr z)pOwL6d&a?AqO&e7Gu*I2`2V|loopf50~Ldo7%}bYS5krS!mm}L-69ag2l@YEHx*~ z`n=`-^dF zTdO3iQ83m2`qg#Sp+TM^aW$YS>kej4UE7>VmV1l+&7+x&&7YM*^eWopnvQCTu?Eb% zW0^4by$r~1@m-_H z?2m3fnu4U#n%;_qat~&xi)#vYc$9gY3J2D}#$><(nL+Wv>}H=W=6n$&KRwdl1lOmq zP9_Y99$JtZ%g|g6-5n9RH+Op0m1OSi3C#{~P*?Az{S2KWpbHTGrBtBV`laj1lgI<> zvz|W3@jR55W5DfeO#6Hp(@`OwjEvQ2J)xtM=R%oY+Y#`$?ei$zE~$QJfx9RYKfC_S z@x>eQO^m7(LTcBT&grVRw>waqXx_W7!44CSABz_FeS3%Wefr>c|L17;VSQ>oVl@1P zvKq;MOf&9zT9v;~l#V%^sjn-t_xRyDw5$0$ker3-nTjyDXC@`!F0Hn=IJ4FzWZvaD z7UfQvP)^QMV{o5ldDHqv*Qt9VK*OfRWeMFL-+uT)d;ZFjrDG`CFZcgM;e_9s9xjbn zdc1k?GyS`U3=2quSi{fDG;J?ad!8@L?XUZl{Ik(k8SFSDgJJu)AipAar>(nLJJr=_ z1e6TXn!tbGP4z6VfdqYHHRLvQ+3#x^?~31yicFnfl~W*xXKB_q+Q@D~RZ`sRi;{@g z&(^=7yyrSh)y#7;u$3O^+9;Y<2F**t`q1cKPRZju=b>9Jql{k;#NW&PoplV(7Zv}P z1psECi1FU4lffsU1~_u9Nbc94xN7@fX*db}6?%KA{mo@mz3XMa*!Wg>VeTI*QT}Wt zJAS9RzrRO?zDxxz(q?&PG3hN=NP&2}ml*%z++ig6cwktWkN=h}S}?*M@g;*ZP^~yk6aMRdCpQKkPXQ z-BlB+F^9=8ta==JTUVI2)v<(29Ag?{0=)_o3cSyZWLl#_v??(oK?&(!OUsA2tq*#- ze?-PxabiaPJZ*nnRj8+)H)i+B;{Q8)OoPLTC>M0t zQQ7KC?LQHJU&8|DBVQuGKzZoOx`q-!d^GH>aGic+<1#NrN#nB_t|-?88LsAz3=bZQY~>|K8e>qR*iAT&9(A&V1;Ue z)!2&b+f-18#*l17?WHD%SPXIH;9b(Sad4Ibzn4Smc0c!@oT3nuiZQ7kr#ykq72b*V z)zf#K`W`0qvT|}YdL)a)d0WzvPv;c-j|NG_$hEEcw#LlTw(%A=j2y(+eQmNa<2%Ia zbm;m1_K)5=)G6H#$ln%@xfMM9&bGWWoHz&Ofdvblhi;~u+L}#vRhh@vqR`RlCpb!L z&B&!WqoH$PuuL5l=XXEtR@)7cnUxrS#=GCiJC^(w#+t`vHcsX^LmJ-!QfC71p+4rp z7K4DC=zflSDv@~`HD2kTwMgwra}`u!fnLcGHS+Tnh^_yc$6kR|a9o6?VrWoDbzWeA zX$Anzae_&*mpyOfqVAW=qe>N;Q|tlQV1m%BZ)RGuc~5ZO`%gtX0B(r#E%-epL`=Pf)AOFg3c zQbq6O9}PM?N1Bbhi~4+ChXuJyA)GNXt13piv2HGmW|Of6%}>wSm|lE-nR4;00Gs25 
zYW}JxKcUOBft7Amo@X}1k7wi?L!x0{GXJxTZQW9~t8lFVthN&_^>gPjyZN|OS!S>C zsw7^kA_P89zvo-m|AagT%(FCiWNsLhAf1se((*}X?Fj#ALo!Ro8+e=;#|#PvhNMe*7OGQScs_h7Z5ixF2lYQ(dFH*@Y~Rh zvY1J!bYAdh#xDIDT~q(@s6(mb27+^HcWcjc|4cifYRC>9@pL*XCUj2$J#<6_)=e~5 z6K2jBT@gGP(y?THZIj1?rQ1B^v)8im zNFVLfCAO5OL-p;(=32Hwqtz5`FHNY z2RfK{txYQ1N|kjWego$U;Sn@zOtGtJObDnoS0P%f2but3bCl$~u>nnb^6W~RM-fro zda&7A-#20R6X;@s!Hu`0qocF)X7cCZ({+@ZV-9&cZrdk(ZutcM06qnfJov@A9hTro z1gIpy2)kP>_HCO_R!F{@{a7GGuVW+Dy^b{JIq=|N&K=7ZHDCP+RjkZVUU9Wwfw~`^KAy+W3g_Et0(A^qSHHX>i_OGAWh`p<{= zgj`#v(C-?);T?)(gvSu7nu>!q)iOdVfQmUoUJFrO(7apL5>YwvYz zbUb=b-bo;c>KuSJ(g?d3%Z{m`F=DcrsK$m31>PX#&o|b@E8rS(^Qq6ERqK-xp98@* z6IZ&NR}_dA>6fgR_$Udrku@#e%aP5g*7?&1l?f2r)NOp3@xsl#F_tZl;5UGfjDK@` zR$!*Lv&MIA*Ro+%JRa^X_BY(j3ML8rYRUbkLRY_CzV8(p z&W=A8J129z<+8_Jx#wE&;n~GO7E)(R0IPDchUF;QbGfOl^kPAPDX)67_i_^h=+Mu^ zdZe>j5x5@Qz6VX0Na01NuekCzyq?2Cmox!I9iou_G=55hxBT0M4ma+$hI!6LZZt1P z87E+QJGUS9i1QEH_)<`i=<*H_>V|D!cbf)oZ%kiT!dWs0Zqdci3${ygA1+0bjh25Trdrn>G0Vs>*H&{u20 z*GfFIVCGG?Mq}@nD{P1Z_;6Hmb$_4fHe)xvGL_}HwCLU z{E--)1AwJ^5i^jZFcLVteVK~MHX_{?RRU2&lI|7OH)I#FGa{FMg47WY&R94AU&L!r0S6&r<97K5s(eJBxYsw zjV9Dsa~jYS2|Xhl*B2%LPbZ%4cUT4W*s!4A)u29$^H|M{S*kht(Cg1!P)7#SvlvxI z=$N?2Co`1}EvuTug%=H<7X2WD5B})TD-~75!_AnVE+cYo^rqE)^H&%d2zry#ZLIdA zIZhz3NVoUs8%C;Q{pLP1Qbatq)R+uz9-)U6(=h6E4T`REIlCa=`ym)BsT@FR#2`{N zu?G|ybceW9YF5K+dkga}U^Rt$MPGaF)9@2jA%jwhZ}mKiw) z6#}A{C^;0Hi}-TtblZDKi!)qvFZK+pB@KCW7=4tW9|fLt>I-CoZvf@Kx*z{hN(R{`slO9wYs8V!kRdgk z3gJ7_%sBChZhoIoLXU)Ey8k?@#1+gbU^?nC>i|thjL1-G5jILTyV~G4e((AJZqtM7z*K4Y@5ly|&iA^xeN zT;lFKo2olWH=8eSkr#jtmyOH z9;xo?M4FZJ@%4-0gY#qEEvJFXV|jS8IF3S2&jspxRSkxsAbz?1o^BR2^Hq66XE}dL zLf)GkVllr_ulf*|`p1*12L5hmW3qiN3($YCF7OTe_{0C7)iUHkEP&!60o5ZwvKmn9 zUC+{TTot+ZOTozKqmqS+Tg;*_$~8zP`xl%VYAb1g_9SR80Se{6&Q0um&O{y%lVl;| zS@h+2(xs!8uXyxnB(x}C;vLKLPA@=+oIs>aQKu{n47|7}SOV9IXR(*LkoeM)oC?O4 z@zUDg*)*GLO7kTlQ~2NxmBFUGCqcdjj(1q`9VOErHY|l6O=9mqdgnQgO$7`^3oC6c zB(~*VY4c!=_CikVLBqOqDKqa?i~)V~q2lguEYQN0mVK4#FIW!v9x%cQh8BTfSxHlxXFBTk);fa><}3i~GkAQuuBq|0ftIy}h=JC= zFt){R(|d6sQP0YtH3|L8w(jX32nihHXV ztO5OanH8S+4q*oR#$K!J@_NdnTi!DA7!)lEJ*sw=nhNW84K5avG(ys|nBEO+TS=tS zn=?7Smc5%#5~_mEyKZ^Ik|;X!<6#+3y&9|wN$rkX&ZzjE@{u>Dl8s!lK-U(AYvY5bREPI=(&VpdBGAPq8ZTzo_95mUK_{!N>KtyFQV>(&aafghI4$Ig z#B+UmRW8NA1bHT5S>?z(+Vn|2pe5dxru<4;3upEzdJLrL@l-4TBjUZh?f;lhHsl zb;@D$RGAm!SSf_-6tA4^`)&tquBl9Ahg2KvBc}36W>bANt189d zX4P*kQ?bJ+ydAr6v*w?lJnHZU?_n#wLKj1~1EB82(U*)9!8ie@pl-%UFB{;I^Mt1s#b433Dc zWz9y`dVsE z)^+42uLi}O3Efg)@PPfd6-2#t|6@NW-@JcT?~#;!n)p9bcALSKkL+=vb<}p!q2rh9 zG+>@o8tX%j*E;*-yLEt?pQlnG4p4!fo+IMu357NlqiT^9*1&F8v%b{p2rie8zRP9fVIkjnuyEEmAR|12;x- z;z8;m?Ho>D#J(S$P(Ye}ev7<60w8&xRtlpxuH6?pU4xGuWPgQWUZEVo` zPoKTkYeWQnHNskp7|&c_XIxIlaJ6FX4T=jB!NbqsQO9yL#?y95)w_2m-1Fj7UWk21<`R6YE zxib3m`O!Acj4~olBol(Qirp!14}gpWh-^3R$&`LtqxE{5!|^$j(NkJUC1}G{iQcS} z4YJ-PbVq;k8v_Ir2Q8k$KTtGa1|=!g)hiw>XxjO0FjS!9Ht3ZJ8Qi4yQy6j-5PsKa zk};rCsg(yP2>(BVZ;cb^azot9m`7&btU4u@n;-khv<}gxwmq;umOK*i&WlmI-Jaf& zu`uLs3m|^dz_f%=8m4L{Cxh<+zdjDV=-}h8!LH9TJRkfbTnw@{dQWkTD{aHeG_K20 zIpOv8Cu8q!eAQ@DH90X`%j8bIRL6N#@!QF5;iEkn4|F7RTy36%0`$ecrwmYdk=lz% z+P@c?d+n9-qY#cRr5O}04A0Po`&LQVEFAhk^UjuF9k`_>Wl%HBSPuPf) zv=v0|xrZ`)a$-|H;lT`HOt6B5820nqB>n65d_H{C3th z-GrCmCTOpLYfiHD>q5{czHec5`!8MYn^#{ycf9B`z)yPapeDd}9$qd;;*T$NwTdNBASNM==z;ch6he;TrH*Gq3s;2bc0zMU zCV8FbSl{aISmaN=OdO)lwRB$b7pL2Zy53-#M=G6Jw5q*wQQM`6w=i2kQbX` znjr(ISS6q7lWh)ao>zs1jgzGH&EKfM=~=t~^V`7nT=;bu2XV7(>r=~L48)V$N{(=y z@b?qi4&?WDx04DK<8RK-9qZ5IJR5vsDT#>n`0v>%9X0i~0oN0-N@NEE5XNV@H0Hi?x;Cc}d)f$62aY0{RBJTK^5EQ#YyHm8aR|Ktqz1nX*) z{bVm2v&pUr`_wI~9LoZI5?2FfP$g=)abqC}&Z)&9G*DG7*Ii(iCT(eHeLIi^o--^z 
zzv)pY-DniCYS{XwEWu|Jq<`$lPAB7NdSf-G9=tPv;dM2V&XB~gC6VoTyQ^Hfg}-L? zabC0PB*IQj0FVW6>(lXZcbZYB$d15Ad&1G=v&d%nK0IQga8^Ci!ckINmw(iF8g9h= zT2PPpQhYV-=j5C7NmAWg9VM?W=KHi}HhoJix0^5Dq+cL^KqaH2BZ=$C(9;PC$-S;^ zS$+^{9 z2EqUkxtXL(8omT|>}r%a$SeZoe^X`sC9m5-1>}BfJT7u)Zi+jCH)7~Dupf;r7Z=ho zD|#g8(=z_BEmUJBxEi6JOJr@4)00;pcOtY+( z-SL`{q3Gf}y!Bd9FBMK8uZ{=bzkEgeWh|a+$lWkoa6VVql$3WxC7XND!`9K+#iqL*neQ_#}OI}H;qd%f={nt&>l^l*N*a26} zOeS~yV4Vyjr!ps%g&pg2&RPy@t+{#LMmj_eGY19Id9Ih?3>V1Ex5`*QaXyp&?mzWD z%VYOJ=CDM&0n1M@7=7z z;+ViU^f*F+PFV>N%GpqE8ReC?NPB=%-Hf;OJJ5WoAC>(N!95=4;?lWmfz=W z#M(AyJQ>nROSie)gesLq76vaN{ZMQx)esRri#(ON5O3L2Qrl-I<+n&a1^q)o9_-6hn06yK7alkW_n#?D4)9f^hzxIhH1faHplQh zJDoMMpQ_*57RPHgB%vlUdSD(+M;;9jll$GfJu-W;ZPV~Dj^r4d+DpZ3@o<~tJu3ap z5oN0MA(u@67RIY=>m_g(2QLu3DMCX~(dzS@%|E}v(BK2&!9^i5a6n@AC_n8TmvaOy zoC-^a_0?&=$#Vb1KuStkjPPKn;?f4_sld8|Z<=Zrh`2(kD?QXvFL=)JsP$3cx?pY^ z24S@?`UQVb*HD#WJN9qQ7B`R^)2a$49_%YMqa(nT-gIP#oQG3uEwsNJj&aDq^Gnc@ z?@Pmfn(t_YOTTePQ^App>Cfgi#Ithk8S?yY+M?rSBJcTwvP*UEgK%w~nb#}K5?dQ5 zs{)#&`MnR+g@tMN6eb=nUWVlb9CpMX*lSb!1-_ilQtbv>Y$J+Z5OtjGd=8#DP zlzGW=m+Xw67YE8rs1{a$&2^i#-VJQs2k61=4V`&Q03~wR90AYS-=Sp2=EbGv3VQ3{ z!bwp{Nyv(I%h>l8EHX{f*0@M9_j#!W%rJx7q>o<=`Ni&id&O!ocW8r^?oK&*ROZ{L z?-=4}Ja3H1?o<3|;vy~L@uIAPw`HZYzRFRN%`fb;YJ2b?HsBH5O4 z2|u9#Qp^caG3L_>$gJBF0FB9-EQJ|Az1DzoLotIZ-dp)vE$?<5*otr!zqvrfzB? zap;Pg{W+IH9PdW85lcqPH1%e?N={?LVyBbzkSddOmK|{a2i?z<)jGnIe7t}10jX&3 zLh}Q2H!Y_>15U*=Q>W{$QG?6XcoWQ0Z+96h3)t&>Y!||9y{pjwxTf;VeMkzQM(?bR zRgWWz@5mm-L`8aq`H_edx#1mtr~VUwM3>fX2IzA4Hc{_P^IVbpp>{duQ3oX|<9nF) z!{A4{DC&N$YD)`zN^S!y=b0%7(p9lt{q+{_e4hk~oX2a#1ZRDFK^G@_hdqXDOF|#f zHD5)dt>UU(^eS#R`oL$``Ijs>v$szdTw|1mY|BZ2!iDN2ugoT$&$$!KFwq)}R2akg zp6w$dfcdJBTKTRdzu2P{QxsFryo;EE)ryvTT$XA4_J2;fI zTo1^2mW;%KbOU%D(GuQ3<&ZWXx+`}|?ZE$gXd#>GR!t7Jw-qW74X{NYx1Kn{^pGY! z`&mLFNPaQE%J9SUx=u%S8_widc_)zyWe=^5oHf~;x0F?1Oyy(JeibI=Oh}Eq#57h; zl)0D37v@q*lCkd-P^zpc2w<&xJ|xjeKXG7ByH3Rjdcc!(W8O*bAua$Qe_-4ILgYEB zuiRFR&&9eGPV??Ozx%nP1#tsQ?;VO+?P0tNFT`LcVr;bm{TZVM#<6zLd$sOWZj?q3 za7abm-UBb>RgO+#G5Z^X7!V-7@zwcv#RB1j(MhB4M6}}VF*3NjWb8|h;f@cRgH~TM zW(AzG!6e5dGkA>Gu3?Cdi`be~ATBmK9c3e-EGUDEuWjd3(JZSGj5aL3!2ne@Xih~Y z01CT&*ip*6r3(Gdn=D6PTFOdg0uS|b&xU1v+bAoG`-S?g2B_}mOMs@*4acYqD0ZXl zq{oXY!O?pUr!AAd`!i95>xon`1wA9vfej znwNA(sK7}GpXKiEwMfuTYE~%_7nDFwx^d753NhZmIX{r+GO+>ii2;iL111E_W}P-a zDIv+(2=z5Fan8M|m8?IuI$y_Pq38@8EW4ss8Rk9rF=i2<{&vQToqzSk*TW0?U2qQ( z&q5gPaFb1$6rzZ{8X-l4iUb?AEOrf>iH-#+q1DrD-aYqGZbG^1g0C&Yb1A*D&rLu(UN z#)*zNlK$<&)^}ApkZzV8=9Dcn)*QO=)Zpp+YRb5pLqbE4dTDwjn&s%)n&6`WfuDX_^9*1yVKCXyRPN;XY$wYJ&4A)`zxniDM7+0r7I%%cfc^ z1rxCczY|EekZx+lDraSu@6Y|7r)#aXOYy2E`xhQL9NydZWdO7U0FOXZYqNyZolwhs z|INs1e{?fM5|{X1jy=oIol2U+Sm6q&jO@Lzo*-LLeB~~}wCT&kgi?NTT9_|xYMi(h zsKFB~D4d>cZYrg9a!oU|n^(5hsM60HdNTHG(pnvNy|0;GjSfWLtA3ye_!oeEx*jti z^%^E)V|u+U*nE!Dc2?e)s3xZl{vO7AS3~MwT8xU-M-@Vo4>rloEGLc6!oC><)}QL# z3WUa-&W`6D-yL@eC zh~kkar`j39|HA^<+4kH2Y4kAWRuw)fY@&w;ZS#5H499;~cH`l5nftwE=5Tboa<+SK z&tGC-mF6K^s6?a|RjP@Bn}!dhSluCKH+Ee(^AU3X19JTqYR2)Yp<2N;^P{>9c32h8 zp6e}@%GtEL5TR95$%EYS*LnnP*_87X>9wpmnJkrv-ZOQ!`Bt)*sx0R|NqZbp!re}+ zi<0v+rWpf*?*-H;U!2R|57fz{R3v8EkNM<`hpn3Qz<6)$=pvsG4h)y_oO2{KNj!sW zj6$O9ll4VFO90o_kM2Dsj^49ce^n_l4b+c6EP$t55X?kEy86Y=ME9i~eJL_qnkApY zck*0Nwb$K`dEv;*Yo;;J`xqwb`n5@ z_ln>ZcGC7_Cvr2fsUI-}4NQFmG`EVxx z%FfE*NSb-|x`NN~J_I?zk?)nXk>CcUtKmy`UbuNp;+q4bQ1YA(%A@sf2S|km{hECr zShCFm&Nmnhqu_V%{;Z!f3`IxMnn{xuTmzVr@W`(iW`JyM*^^#}=#NChPbRRpw69F~ z*-RlQ{U+BTOQECadOIYUm!&pEyYo#vlf{=xL#mafF11B5OoGk`Vhr%HQ5q#7 zrPhxXa4(cLi0;Rm$V6yEGUAA@U7|HFy=(N`hu)d?4g7MMP;hp(x%)YJSg3*lon)ot zL{*Xg$4-NB#Ml5pxrr2e=-U{F^+hkk<{9^M6ur2jLx+_4etfaiu8}rYj>Tn=i+jgE 
zEv`K|zrLV7;%PU-l+&*aWXiYV&6I#yrf+zvARojd-i&qLj$M6U4mPs`Zcz4_wRyrPqrYN-Gk3P|W-#zY_Y~419h7ZR(Hy(>>t$*z|L&2A{9<)Lu+=#7XQ+VTm z*nB0o*f-E5+fB8n_T)5VJimT`op;#p|87ScNwph9{u3~uQ_L|`$-G=|k|n+jP7k*@#HL)sN!x-_(FmMkS`OwL!S z84$V4sc^G*uWEL3tFLx)a?cPthrXV(;#TvLr(a%}$6PJLnUf2C%skV+W#E)H2u$*A z0?bZV6txs>mR*{*Ey4rmr9`4c{m`^#B%xZSVfap^r)_x!teZoCk2w6UX1(NG)U#Ht zDwxyLWn(Rm2#6lt|UdOYuD#lM(=qvu8%aM(_7J`6d8yKPJ3!J2if8# z?XKs;YKF~E(_Hu}mdy(_`$P18pBY zQwg*?vKkawDGNJBU_mZCY;Vz|M?0$Y7i@ZbW(E&-Hwct&Yxc8Z9wceU7bGy6So@m; zo9C&Oi(cf1l{7EDfZSfembzSOBDPB=^UsPrh00u27@{(vKkKLJN55B z#GEtD#j%pl97wp3ho3Y)d|3~CZ_+vk7ED;QJMshc3y~ZD3N#l;oeEs7-M4$%+7{7i z{K{Sk5xF8-FKJadvn4AA)HDdfjq+e`PeqkvshCMYL#>Qn%sc}8Re-=rr?Y1}Pu_Cy z3u|6OJk(!Ndq3&Mzv2Z=s?aW^6Qk#4LB3HbZF0q6CwWdj(_Oy|pzoDiNhy$?0HBDb zYd0zB(6+t2mYmj;N+SNqWU1^>1}f!P_A(->s4vLhIDi?WRjcr0yW|r#k3xLMvMhy) z7eT&$V1jz?YbZ4oTFMzQ$QSHB-T1OlUcF~qPYx5B{f~nt1-@< zDu>KrblcAAbw6o!!q=_N^p^rncIuzoJC>`X zyq@HZa0bYcmbL=3*KOr4qH|RN7z0%jaF%nj$Fm6jdzR5fDVRjwHTx=8FHh=*nE0EWc~vuAJ?-Fz&G^ z8f|tiRm{W55y>;AhZs>SA~4!B<~VA_j1y{~KO0t`&XPV#YDj|bXur#SV@-45VB+AU&Dw8NEf|iqltiIR$FGDB82NjJ4BwBB;@ffLy4=E@EA73!_`hlj z_lP3MVDKjQ6oG~lK_w5{??7^)ZT!&n9>{Ag*}gd{S7{>@>ZA02jJT%UOH!EU1RgW3 zr7X@S$r%jjACF6MLc7+;L9D!1>9y@pP4unwd9qi3q4 zVZRmmp$K?jF*{SLK<>fLvBsF%6YFaiYH3`6{J|(a`RMb*`ii4S&A9l|rA4Ryq!Zsi zk9ROhFqF0(?vp#}NnP1q$4JX5XTJfSd*9r94SAD(+d_+By<3yp2^_Z3F(o^~crYE) z2um7~*a6`sj+?@wlEQ>a)#k^BOy&KV-V~)&r8}jScR`ZQP~Z;fA+msk{HD-^7 z-a4>p(k>--WaN%g#|cyl$cdF{Nz3}H9UJn? z+tB3o2=@DB(M23;+9xr;bGF(QqoryhxjeW$JT`4As<$}rK&*m%c0D`GgxR_Ktr?)G z2U;I76~-drt}JR;@DGXVyK;QYv>45grz5u`Mv8GJ)H3!bk@qZJ3V@t+MN5BZ$FZ7$ zWW>ktyQ+C+q2h^q@h$UQ*ZvfhvM$i0DTKc7;6f&Ntoh%%irEKsyrdD}cjAM~2~J(G z)X04*QT8tttCDFTqTllrW59<37D zMaDKv+Uew&VXHoyDMbTJQQO=v+RLYb4&W!w)T_urd-e+1iJ_c95VC*R^K6EGz){nThy>KmEW5 ztipu*O|~yLL&i8M7ssERmMv>|%Eb`9WFRL|IOovyq#*+|Zu;jxosYI`#jzw&VSf0v z$%p^Oz(RC!^Gf!j72p7=^Gu8i%tVM&I-XJ8NwFm9FTHrt2<5{5~6pPj>x zbTazG7@>I-_TD8q@2ej`_Im>`9208F_m?Np>=)|V=9gRw0cOZIXDHV*o6ySR7v(UwZxOLuYQa`1!o9SbL3g0MD{F?H{yZAYz=C|zSDih0Tn;|uy?t<6S2S>p5u;Za`kF(u(77WV?1Czni5R6?U{^v7~D}D;~&ZwG~JLQ z=imtWEhdxc48w5}bh2Z*qL=*4qZ3Ws_HOhgkZ!T1gerX^!b9vt!tkJMq>hbBEgeO1 zWvZJ`=h*;>Vkgy$$5!%}NzOH9>4eMvBiWDj%IrUwq7pQSNH%(Z{3=(5o|HWB-*jJ` zy^D5oUXnd!q;1elg!O1{y5!ObxfkG*=TY%^F{hFQnK(Se;i@PiRh;&iX*8p*tP#Od zcgZX(Ss1q^_rzgK3GWjFgQ62{E7O~UWGeXhP>hA(qN~66*#{%0xLybj1gnMTJ8FB0 zfW+z00NSLQHWuWmVBmJ{<{lg6D6;Dx0ZlbidN_)BB*C0u%9gdJ%Vk03krn=c*hKZ0 z|J7pkvSJrM`i9T!HtAcpbIf43bg&=TP#zN++#zXsNh9>}-$~<^83iPJSwk2d>YRio z|E&I*AlbaVbxxL1;~kWtK{XC`vE+*NUmzx@ZWtW5&4T1#Q47`T$EUqf#2PDP_xWs~ zHu@)}==0mV0lajd=Ic3HU5Fgf@E{=$<$)7Zs*Ii~%gO$zuqS>nF{N4)`2psDJ`1oG zHp&WI#sKcSKJtIZGYn9kfOAEX2P4PIkG^@&AdvPcu zs*;PPuuH48cg)U&@iP`K7z{IpawBt>zITk--ii^WHruO1o@2{f@9*;ummvqResW+4 zIBRC+_}lCYQ(2!-rp+v$=^ObJ*epwF)EpGK{LF1w?jzg_OWvchGRBzWCTr}=6^%7Q zn)*-Az&1r;%e~vaYC_U44r1}Ca!lw>7Y(+c?OG5upXO-`!Ct<#Ok{`XFR9lmSojCo zW9&gM0Rp@LptXnVKIso8QtPl=G(;>)O$)?GZ`kH1l#kKY&j5+ycPA;J8vwAm=DQD_ z;*6-%F_2QLcg4sp`*>Mvuos)rc{751OkQz@30xblQK6(6pj2L}>HIcl?Iv%d{PdXRKxd`=v0*Em zD*Ei(F3_+oG;J^o3+ey|JB_#ELFINFD^ZKr=iZAD#kVj2O~I5@wYGO6Jz&$VD5Q%U z8BYU9bLW;eZU?su7V^MEOGlKe$>-OlV_XPMw`dHNwzM7-MWI6fFac&BV%gwXGsl7l zq8Uj+Rh^!S}15#p8a&Mcp=`@d(ubel*;VWFb#F1WyooFC%rmNcx9ExRv&S7_F#}3pwzFZ^5 z!*AtZ?=Qs5uT_06)5i(jWhJy8-)ZYHqe}Q4$uRe$I(aIPtj4iKW^JJYb0)ti#55D( z4cWEN6Vg&7zP;}e!}Mo1s!KOjuR;TBGBjtJ>^kDB9n0E6{F$O!nY*mau((d;HgBIwcem$t4a|ri<0?Vh*CS!gG_WX z?HDxBSfWm6{XWX}td1${g7^Jey&tYlx7M(0xzl-bo7WlhSO_h<@$;Fj`5gmhdFm!d z^#u8TcftxP{GRR78ya`aR!uq<+QdZ5P@$AMp+TgV4M!xK7=GjHb0*cBs*qqn9avVL zATd5p5K)(@!vDS6V*ZFdqF#h3K1KMQ`WQZg%i$GkX)L-4j@>qDEYdWsUEUHFV2isP 
z%j4o;#JtSqM{=X8L_3>-;j*N(v)rsK3}d=zk@b8j#h1|QG$mo>%CpY~yvaXlx{5yt zx$j}^!|z7AOxBYkAn5vW-|?)d%8jSE!^T=U?7EDVvN8J#$DR9n5p5RjVid^~m=Dy5 zJ*BLyl@nerIt6!elSo_}BRb;CNxF=YRLkfne-jBi&dr0&!& zcAciO#~)|WOy?WZiY4b~@9^)Q*EjLztYwxjtfAvI_uu${?RIUH@R)LpSJzIICzn!M zI*@fX#OdMbf0OZK@28?a*LaX|F3LX;B?Pr5|FIT<^AQf2IoflQJV4mLBJ$XUx+I80 zHduAuotl^0k`SJ*GYoS-vx61QZ7A&y+$s0VJRBwT9Gc3~BoCkX_SlYf_k|g4sBF5w zN2gL<*Ub*$W+3zbzb8`~{TX)zW>hG(Cqwn~@f}02+@ve#Rq?817ZvwgI&bvUwo@E} zM8w~rD4aO{{ns&0wd;z3pCp!A3_oQqb-7U0qH|>0YohsfmWjZ>-(gddGXH#=$Slo8 zbx7_-U7wcr-COeNBrSY2c0ig|N}|EC#O(OGu*WQ?T0i!yT!SyHuJ$%Fgb$v|sQpSy z5OwJ~l*S$7Zn3X_*F_I5WuXDu`qWO0RQ&i_LLk%;8~;N?ERs6B-2rw`FD>F{H{)?5 z=c#d9f*SGw;Zdv+ZfsSh#FaXeT#c==tj2;iMe%+)Ng=&f|N2q!slW9;sOnHl;lp{!q?+N!}~*OdsQ-EIINJ?SsyIy2f<9 zQainp(8Xo`@%+wXs%SQp*7jBIc!yVlL+a3&t;6_69fDJld@KM=J%UDxVfL6k3jnP3 zo$2I=BM2p^(V8dB5$dg5`TT3$*u@Ys_MDvuvta9Ls)uqCGw3yG++sWK>4d~@hZ{A1 zXJn~t0#zlH{-6z@i!^+Bw&|>4= z6G^cyIX-v|%j;hm_!)x%WIBaE`G0duVBF>Xiaq8jBgoR{a7r^?!o_UTKIwY<$g<=3 zPgzBDL(pbu$KGOvHr<|+u(sgg{i94Y(=u=L`=`Yz(YDuW;7Ep52qJ1=>f=Vjh78gd8H3+U+t(VsbOE;bju)1fdj2ETw1O1c3B^67dehEDZSU zEEDx;`aFZ+ae5R?C^4x+A`UcP()@xMo-X~6s;=g8Sd3?<@xok1QlIvy*uC9e1NZ*( z8JRI=#dBukgygB)_6J+FQZ;g~9F*gIMiWgD0b<$D%$shRF(ZrbOpR?``nJ}pIZ#!F zPg#%qNo^Y7zPe+XJRbXo6FokkduUT--zDjV-^FZkcSJn9`BXg+Q9(H;h0eZ z!t0uJYMz=(gcVM_U}{Krm=b$xHba_T=+%cMkcw~s3**Y4%{8{yS%ZGp>jzlngRigz zTLnh36XsTlZrw52m31?7Nz<-2JNkT>3vp+_#_HMDOw}1Fzn(uqH3A@!CJyE&469|5 za|hW@^H)wF)2{TG;6A2o`>A1d$TRE9RrP%CA~HjOe9Q+}SSq{b#PM{$U%!~GG_9?b zPo3sB^aGMweWA`1}`EY|z8~)!nsP z4MaY5gkrs^_>A7X`;JoOS(Dev z4LF=Kg|W&Em~U5Va6kRV@wacu`tRe|zEAA%wA&UnIP*{;R=l*)$mR~a&R0zs+BJLx zYSn~Nxg%5|JlTdS^aA{}2aUXuyA^f`a!%#(LA2Bek-H~5QsB>}>m3=r!{PCRI`XRR z!5>w$7s2f2I2)BnT(p5qQqL3_@d6|{@>kx3>Ps%$|Nj+^VJF*vtg_LAM0`q;aSKPA zt<-2in*9b{Qu$Qe0EH>H^Lsp7-ejyw1pmy4jOpnn+kr5m&oJccNF|}vdJBn3?ESEQ1Ay>t<0TfHFb#eg!*S&>YOkxb+aaoE3Xr;ZFvBtqVr-C zSwr?TXGh!9t4aj2iD$-{+>w84+WC5L*CmXEF>$0N|;njo)OsTpC|+f;+k@Mk9^@SIYoBs4Ci2ekFH)@JUvj z4H!M{=bI&uu+vV70-{Z_pM&xKU-{)>B#)z4^SXeo#~MEi^w$vmxV20xQh(a%Zh;A! 
zU1EYO1&B}o``fQTR;{n6d@$=~%S&)K%)*oyZsIer1TJY$Rs6vn2*@^TAt6t}AM0BJ z;s4dQKlb*B0&Ql?kEhJz>*hRvzYNkOiTBrmJ~p4<_c+1>J%%20dR|hA`{zgz?OGNV^`giJV<)#t2W_e43mLI_K^?;`Z)1 zNFXGd+6HA??g{sf$u=7ec9;ObL48>DDdx)`+WGF%Mzo>N=ad>HkvHd5#}PAP@cGK0 z0E{weFLN>DyZbZbG9tBqqG;YJ&;tcbv8yyVf}0^A|1_1CbMyDs^tcGi3$6VvHNsSj z{FT6H`=jh7|NRj8+f_!@ z3e&F21()u$-@yJq2+7>*x?-PHY&c8Xt`2dD^|naRqtNtMjx3}W-9PqM10IC zJ?j&ALnqT%w9HZ&R|0`gM3Hk5%YMlw+;Vm%K&Y@^u_c+HtXuV<0;`-F0a1%j9+OeX z&g0k<06#jK59()qNE!PMIpoaNL7HA0LqR`kEsq zlqFQ~Xi+XVCQ@#v$mJ8;OQ?l=lI)~P?Y=}u5f<{zF7A;`{!eu@T%8-r;W?$}AhJ z3brCf1KNH)TJPmCegoYLFM@-8pwqOnEzyVlC{EIUlO=p3cuvBOeMD(!ANL_)b~xKYm^WzS4Z@`+s9nL)EbU#Lz%fO+Lc6 zz(J!v4xO|!mldnQMet^qT1eW7pb$*dAh8x=J8s|03$+tCv$p*mFXBMxikoEZ2HwB45^T_*;IeDC#o-Z_HUn9~JQs(+4A z^WrJH^+^1h$|%|jffs#-xG|+Ek6r3Ugr8Wk5d+=Bo-gk^IoKvxpvup3 z(dK4uXhZbLT7iJcTS-Mkb`c<+g}B-#h7R_1aa*3uc?w3p2ywS=io569F(cDFiK7<0 zsY(A@KU)@j`rMJwz8gp;)5{k3eKk|^V<{X+39&M+$+Iq*l&| zDJK|E4gD2RF()NLd<~qfaoeg)27TnJ?ku@@kT|7NnpE%tq!bxT%cAFlfsu$ygW3bR zbw8*(U(d@sHL~dxe7;IINhy}W2$b=tK|hy6qRs7;T43awRix1Zb8mmhA%C}q7+rYR z^!u2OW{-POD$oERu+^u%-+^5+7woo#i@Dv<9UQl3z{j_Z;&1-e~f}ZZ^L1~e6-DNlSRn+gXP1V{zeZ&cD(`Bj0YZa}v79X6n ze-72(P23@O;kNeBjvGueqIKC*r}wC1EyNl2xDcpF8c_{*nk?hpx(3KhV+$=#V}`{ znM)}`CgtebD-1i#NR`3(FN zZ27{PsbA#@irfz+07ia#D4}m*o3whMAX5X_15Ffla$jLzK<{wcbXn=!7bCy%;uafk z5hBkiHcuCjfhE?Tbj~2k2&}$RDl%rx;Dvd&qJnjJ+w3MW&-@~5Mu%Vdt^<3xf;f=3 zbLbZ+%*p}#>^;@Y55hUXXJeLh=~;oEPLsTN`LDMK6Yn#2K3<)Wt{#~*EfB51S>I5K z-nTvP&(UoGism|o8dz3s&M)#=D1pP;hXy~TgpF6#oJUQ>Vk*N|4ihH?t0|0Qk^lns zd=r2VH--*)SRuACnn01+(3ZR=;0;JBQAKl$y2(BWt*dW|dL}-TrIbF8I;dVqc?zdh zz^zLs`SAYu8IF)-x@u#ex;COsjuHvPtv+o;sGN2ZZle8Q)hZZvMN#)vi;mmuXVGPO)7Io( z1eCS`IKZdALl%R&)!qF!>{j%NY`K}{*|_5@7>wZ8nsg>ea7ciB;X~Cy z=-`qU*8IbOfWP(<_L#8R#f3+JQLMN+jq+)-G|l}DfJ)aK)F+5W=DR?@8HqZHy9jT1 z_|-`K`kYoQD9_9XS#Ux`Sw9*87^=u#W|jk{Hm^~CVJO&FmlRued!MHYjlJN}BnOQJ zerR+o%kHqi05gr)nVNTL{EYU_m}~WCZ^}v9uR`wt;2QPvSRSZ1aGOADc>gxV+zSQM z5(bf^TN|pI>IO3W+x92@A?v-~=K4>7`xFM9(}cX^>*fN=b#Abynq>(7PlEnK9e=hD&F>57-%9- zpbDq;R+tPB65aOm+YLhb_nFFxA6YxPc@6%?GD;&Jh~JQx&; z=`z4x9K~a{{xK>oqn&4q$^q2*UF68mV-E9YU%$UkE3+zlS7v3%c-Z)1YFMG8 zSsJ84=!Yx0+fEiM+L}?;Nb<2cMOIZ(uTEOyKF@wpz{H++>KRgyuv(9U-JXym7LWq` zrcpn4<`NpHU-Rk1F6~|O^eAd=Q_1~sOT^;4fIu!|V7LPa_B;c30td<#SJxU9BAFcP z_3lh-hmm4alZaZcCoLd90}`giMW9cSHV7yQv|eX`{)q9~!cSVSHgMEITi{O^$SxUM zgq649zjy?k%Wz6H<=;7{zO~5phYV;?++XK_jWrhZYb@t+p4kaGWzQoG9d3ed$sZy2 zF?=zVsd>k^;?q$41C_EkZBZcqsPdt0Y4HvY1U2d^7H<4gHc# z^dB?GRQr$5Te7(}at1gZ;6^md2|>*miZ>jf)+Sf~csWxcOu=%R9+Ze;TIyAoBBV_Hh(?_t&|M`SLmnYyOV%q{1N4w3- zBu&?|h0)rl4uZO|Ky<%%O}$E_D-tO8^}mxx93C)WK}oJKrXj};!UQH*>I=srr2s@AUE3Hq@%HSbgvRQM+LMn-um=?@=KqT|{ zx>2z@>429uQ1OXVejUzNvZeCYB}J{|b@j%fn3$FG-R17S_||F@cik{v)HGAAhiXgY zpcsDD6UdJQ*XMP3sblm-<;h$8cgOgFE9J^0kcVEw7D^G`Ukl|P6-bCPVkpo&`w4hi zVD!nG3@*Ng3SGY}B}Vo*4Y0o`4T!52cMASTqcvd3#UI|cCN8SmS+eAHRR<%ZSqD{T zVED@r6*7y-imQlQo|1vg3pGKPiV@KnT`-uX!Srj})+Xf585Mi?Jh#H^jpx2WLJ;*w z!Z5r9^_MDFX%BeggUmi#71*v4RW1&w6I88F5d+t)xnSSTS4fRpLg!_Mu;s5 zYk?>nX>&yzNXnlHV<_&x_mWxOy~RNrhS%PXpY7H=k$Ba(1X22nvy7fGHl@hCs9xN#=b*UJo8S3x+$+HicpU~i6&Gz)JPZ5 ze-gvq5*3~XnyMccfb8IWPyq&_(mqTRA>>5;nDZSd>A9GwB0$DEylcW?YjTbYMdi5V2{GazF%0{)%nrsZr)lMN!|l;`FZv5 zWRi|w1w26i^R*S@hAAFM#uw#t##JU`A^uAUWx62w3m(Fkb1Ivqz^+_tpz`Yo{e@sG zrJOMID?N6=j`D+MI*N4vc4ZwKW(p9MEPZCx$|^n3Rsu*u-D~=e1SGN6;AB&N+o4II zoRqj+8n0W~+2+>~Xb?kfjhx<_N@EF^b9cSeel1#p4!C>}j{Cq-UqA>%WN%l>DR9I& z5q|3H%CAfY8@?2$ZO8_DpX^0el9UN52O4^!d zaGWK`YcGd|vO1?VU1pckB->iG-a8h9nnp~xCYP!+29&&Na6%^ZYk8t7P z@K56=j8e;VHCvH`C@4LrfP6nYIlp))6-*E|rAK}NRqhvZF`Ew05e8Or{AQD1Go^qKJRz{3?T znsAMiMW?cgq@(;~IOZ?=K@Mtqs$)*z*C${R;?v{AJs<8Wby(}Ua;khxJfeAHb 
z<el}-RFoM>3gE(I zZ(@$(H5rTxOuzgWL%)JROb^+%6%{0^%sVly1e8xcyPyns>(k$NrQ3QO=%ci=SpzaA zdA`d7+((jwQTkdK=ryIuMv#tUo_4V8hVl5khn}4Rv_dYZEbyR`)B1i#m=<((R%NtI z6;0skJJX8CZau2cTZ;v`UGe8hy)H#-%_i2*RgGWeTKc_!zyl?RZxb3^?w3hP z?(Q^slG^ywN;02;%iWi$nj&&loUNn%A?yhQBtT=Y3S{7B?ok$@QZ98Nrafii03iCx zZXdp%RA01NEgTmb?N@ydtvA%Uj~S7vnUr(~==sEK5h-#M2U~rOzgAy5aC1Ps5RJEo_vd+uf!s8iRhihCO#CA zN==zjv5ytirF;#-U$VilNZA99#W+P9u5Vfxu)%c|twK<@5$T!X&(NiTAI&mo0!MDu zxCBqqfJT8v#b9O)#wVZMMT9*y07uckEna#ra8Gfz;nlngb`r)r8@6Cc%KJ0}^W`zg zr7x*ILQQdP>oGeWQG-`(YP6A3CH44d#FtoHaL<#d8NpVY`DaT5b9{Ucu@XX_fcZLN zMaS}hYy473L0Ir$cOYI;@2Jq-1amlf$&vl?XKp+NTiGHXLhG;TEhW+4x<8JC_8-vW z1{Xz%6Y|0pRq0ke0vg8}`Cch;KB~e=+$H8S;N$D=Hr`HTgSI`nm~#_E0T3iQ;dFY z7kX>JYJGxitopj+0(49?85j#ghGy0L~b+hLN~@!>;^_kB|A_+AS(Ecbtdu zHP3AP4hp&sMbJgiO$;;}D59n`t?nCSViKSMV8R-X?7o{YY&fc4eqz$4a!6H-en+Cs zJn>R_;;I8w_*5NM{8^LZg1n<)rTdxrn6tFb+;CTg*{{7T{W6%WT|Ya@?EJ}59g%JR z10vi>&gST;9f7;%ftnz9gI?_NnVnot@lM*61APvA{YsbX@jlGvWNhbJJ}RF=W3_8RsX8oZ9PJBE4nuHD3bZN<(EmgE_{BZ!e(1p0bYW2jZxPLA9&9amZT>QeTL84B7kIZ?3eC`xeO_*n=N6Lety?nB?{zbX5m#^ zGQ!j3u@J`+!h`#FjnHI+aA(rp>Mw49)1<$<#sd10lmfIKmF8nHJ5-d6`N0Sq3 zZ+OsW%3YH3>7LtH;z@p)7e>r~`F_QQG)C?zpA2m9j$b-r!!TE7H=mIxdGg8u2X{0iYBg1 z>cSPDrrAEc;Hu3`27B7uRFaPxYcU@TpS=iRJHuw>Y{8$aA?ep@aB%BeOwGGtZC?mk z;~%gw@ZAkw*>fP(VMhq3C`az7LG9%QGiU_WjVulvX-VMC!IEL4bJKcisesf@G?c2m z*MmsyUHGx-drq6V^fd*Z%Q0Fg!O)kt|ArBS-~p|${9By#<5=SqG*uL@vdBpX|jf!Wnvb~9YzXNxwnb# zjuBT7o|J0SDVF^Vzsvs`)~bnr)y5t|u&%$7j_k-bo_g@{`-f!b>1I`O&|1W1GSD;> zxo|rn%Im`U;>)o zEq^zCKC#6`Z#`#3|MB|zYA|h$Ij87d0$S%r-Ft6(se!PikcB+8$k-ZG1Fz_8?;pI1 zyFd25?=z5H>{ke(!WBL<;vtmKQ|T%Fc}TGTTai*k@2_8ZT`At7Pt*k}NB3F2+ynwO zl9TpP$|Y4Xo4xy4nd_D^7=>1;mEEakJRbTs$4qPA%OW%ENIf;hBjnY3Wbk=ixeY@g zD`zL{(H4aMkW_W1j!h)k?uZlc*Y5Z%Z-R&Y?FkA`gKdB~awW^VH*|U9c@Q&v zdrj$;>jU`o$XEL>za|JUH3xUVB`62Y=GCr$h{)wd8Em`1)c$$_DIfPe_qO{Jl@==? zccs?A`xgm7r;BaCwRZ+@p`6&8c1Tl6i=*=$3E-TV)I}vE9zt)yiWmg})o5aw;s&tU zFV*_J#D)(r1oM)eN(P9Ppwg&3AXHZxybchpL2Wnm-i(Sof)%x2YySo`%Rs=16gf_4x+!gQFehf|Y*$a2GK0^-2BMIE3w5cWil<5u5;vD?@Wfad1c@cW@^V#<6 zH!DKN6e(hB5%Xi;alZ)?+z?5J!3KPpkXYgI1Nj+cEh)}tfd3J$ zs}kWE`h0Fo0_(K7`OE5=?-~Nrij~P`eAW2z@~PtX@k~0xZO1`xEfW19USo7Q8;@bU z<_P@7YcwD+!N;8XVf!L#WJ@vt7}PlMTaxjw)^tELt?w_GvexilKF?8eY#k?x$WgUwb4Iv5e6l7KjHU1@Raf75hesRA}OBL(7X-)m?`=@7XX-K;BWWTk9de1?SYjR z+60Gx%u_Iq`nyMHG2+zI)!Zrswiiv%=d>=R;~cWq^!Li#TNI^zkGazGRi8g21m(BU zSs?s06zsoO?25pSF&5G>V@Tt|8+X5==N&+;ouTFZ&!cgv5>?YSZhMtTo+qouJvIeu z8(Xa$CofCTiIg~=14q|b$jgW*0(e~DAg7D2`EVWs%pt4?p|v)(T@Ry8@qPWk1chQt z=1wP}y0!`|WoyT$ilmS@2ccE}Lkx8UiPqlIv9b5;^3lop<7`90GO{MX`CzxBa$@z< z`JnJiO=kgsK?l^r=9D{g@U^H8uTi_9xoXM@5X4O>6=XFKxC@Xz82yN>916Tt>i#f72xcs;@BO_1O=?G| zg`*@iU?a-NMJOPu!r~&1i~&e+#{d(?A&wD1KqP2rv(%uI(5Vjk#q|k1;6H}Ndp>Dn zeX^?`9}`_AC51s1u6w@(3fM?^(~)&_z;R7f$XJQ&69|007?E;VIGq#lEKLu^hCZ^@ zBv8u}0P3Gd$0S#aXgQRIOeiBeK!B|aOW0-&G{m|$4&?N+nmhOdKZ4Y7 zwS0xeVcHBI`HEB#wt)V_?T>Fixv@!cIEDo*o@3bn47e?Mz?cpGkf!dhlZ3prRL}FPjH%b>n@BzW{AADrUgYNxw*x&u-*9J8BfAL)ZyWIb~ z-2c1W|GV7(ySx7#)c^19{y%|j{=2*XpWNM*LU@vi%b(qW3;m?3kG0@%QJ9)^PNOWw zzcH8E-4>;RqsrUidK%n^Ul?_|u9zmj6NMPHx;5QfOr znk?xaa96+^4J?TQ3Xr?^eR^B}4I6Z3L2Nr%EBX5`7IEO3Nde${%FO?lqcMX2wKIb4 zd9@&l!b94P?Nh!s;TgaM_fhyo2<#v? 
z_CaaWAHvZhL*JwQ>F0~84vrWP;VY5gv%c-Eb^2ZIt_@y0V12K@4>uLlp#?Ncd;*+8 zC|t*(pe36pr3kEm$4dAP+%@@LdUeKmT}4Gj*l?l2*mHA$4~0k{-n@UP5nc|YZQ1bne|Y!D52Qnma5uc*k&^1I4zeG&-XbkW zy^8^^eO|Ds*%J>jeT4Y{4*k&`pSPBBNT})iN@9P_Tv!x7Tc-fuD&M<;kn=-|$$tzB z`)@Ix&lfvdCs80+qOtnr;eXBw_A|REkY12uP;iq59DNMs>2FP8JQToNHAR@L@c%FZ z#*=hc04gV$$p`-HdExpE>(A3GVW@z*-}>>^%zldQPj0|I{lD)ue0ae$ljQB*0_Mk_ zbVw_IiU_E=@IhPXdGGx~o`{*-`^I-qBYrSLNRJL{&VFxf)q?lle z4NW%#rDCyre=;$>4t{>THt+&VuOlfp}b3wCa5T5i*{1uByN zAjfAoOwWv1RtNMxw<*R73iVoeGC&a!3r$^7yy*7|8>qT9(jB-2c(3i2x-~XN zusqjGzv8QGe~7bhL?A|aHaB5MEClFEj!qwXODjGIE7F3JQo?hId&!o<{XjmrNZY&T zI+I57QUw2y{Q~4;Qa7UBML1AH=q1SXn&+r725{b1-$^D@r162I-P9VGDYrlThxBqx zAE1@e@|+9`kSV{?1U#%^fdlU+Yzj}zc?Glq!Pg}JXDN&8D_9dcr5KC|IM zW@esmZaVW?U->pXSLeLj>RHc*f=myV4Lijn(GMXO^maepIdodTzc3s5sMWI;#XMMk zx1Hr|84864?N_p|haWBIrR}({4)imVP!8<6`-`Hn zH=)0Z__R3qa5t?fQKt`G8t7jCn?K(%xCc_gsZ&c-sRMapsF;tO*0LvWdRVj)$Gt{eTetDBseVq;+zxs|G8sCJ)b`*%!+b zPjQFUC4@_eE}_9nbyv!F2uMBuUT}(u!ghUz>%J(?bM$zX_8;A^pi3zuW@#aDa*lgD zkhLIZJLq+8M6edIBsoaDs=2jAy=SMZL2gkWL@DD4ZB_?s;E}I;>3Y-;9oXv_#l1&c zh?d|VH5BwfA6#gEU(4}eV#(%OD`kdA=ktZsNkO4(LMl%@@%bZ(N5S~2><`navOL&Vy_p1= zsC2fyUOv=?p9S|Tq4Q(fx*86gSH*Y<2y#Z+7~pd8tD6TI1Z?E(^AdRCnbU0`P*72n zfK-PU{1sIVGo$=P!NCTI)j*=BA%8f@EcIcGH^Qc>G8!mXX zDZ7+F9K$_31pUk~#gQwjl;BL=_tND1^FH;9_4&kmk`tvA8&d2G8d$HYndb~etGvHs z=k;{u^3WJ28;KEWn_LN}KLo7kOuBsHZ;=ZF^$q97Akn913ADyJILTrnL(k>Dr$F&} zdyZ(Ah7wUsI}_*hINq@!nnDwX+R^>%+4R&rfu7ycK&+v+;c{tD`2lMqp~sAX`g`Jg z!=X;BNy8J|H2$Jp0smvtNO@LSVokCl{2}=|8x{75*;siqVVEE1=LCikFxz-KiNy)M zob_7?cfB+OO&roWghM5gx|((!@;|H$`92D1pLH<|rKC3F`8X+1D*Ps51pa-sKX?SBNXtrYWv(r1SHFAo@caY#!xdSv9b-U5UH33hj!xic0)hO z2nX#Zx`hE5TEA8)me6?}44`hLv0gF@GPjIk|J{+nbJwjhu-9)V^8yl})!l?BPMsZq zUy0yT3;2ZI1o3Z({9Z#ozDSh7^(gvhLtE+>OuWp6y$)nJ_nquGZZdBIUhofBD#upF zAJZ=FeMz}CZ}RtYRtcv^Kq}3W)dHOoTRxvP=)}Bf`w%B!#uGhXKCLv{`eg&HM|4)y zDfeC*bX{xs`5l8BacE(6Ew)2Sr3@Z$hvl?4D!RAf)5geylqz^KsROJ+r|n#|9txy z+T>i13}$_`*Tsi(?#pekJ}H?Nma?MQ`0-{)ngMNTD+rmnKyuCq1N;Er8`ial5&T5s zPWM0J8iD`#6g(V>&RH%iYq9foI26Ax?+IAP?jWXEApZu7sy2N^v3-%dCu^eQYe{?s z-CB!(ZS)u5Vw%9uDl{cy>Do}m+DSy2x)DSsKJ-^{0HSa@kK+A%ztMav%)h$FQi+}T z%}BcXxuVOGV#0Yf_s4kwLFaB-+x!)+*sftczZJ#ooYWfCu5WTu%WgTT(}o{<^bQ;| zt_$Kz`%-0%s$QQik>G{#9zx@-a-QO!kPZ<5gp_2^82)YH%4zQo&QibYHD?YG4`Hev z7f?7XsA{ZRM>A7r7i4<^KA)3WUZHb1Rm(zZehRMS&ft=NYFw*4oQ1ZLDKk!-VDG~i z=EE&}2wXhz1m^IAbBO$B+w8^g?;;G%E?aZYT2oaun&lfmY^OAHvyBziN@kH^vF^)A zXrS)3d(=~jl{7t>kJUJ;)GaK6w}z*X7kK}$jdG=fvZ0D5Bh3ANga9 z+}93&I8S!RCkEzz#V5n~8<6?rBA&fJl@~|9IV0z_J}h#WnQgAkZH}M;WjzHOy}ZiJ zqkUK_mS;Msu1yMxq5K%@6S&yCU5~@g6za?@`zS|cP^9DAm9|l zaLmJDY)(%?*em{zr8a-yZEAp}ywQwxer4_L?b1%OD4O(i+e5;c4ZzPeS)| z;_=OZixlZAKiP2;2)r_+T@(Tji117g1**iYG==IUw_1wDLWWln9LFu1E7U9Pu5R*z zzQuf(w%jF=<_YFjy;c~E%M}#{&POms&{&py^|v|?k@~1SZ57WIm3Fq?sw$@cam^kJ z^qia1tEM%yadpO<-qxGb(>3sZv>R5t3zcmhkPQ>sg*yCD!P)!{WlBYN6m9c**TlaLjsfn#tgy{3hDIcGld3i%gCB5Lso%N>n2BOeXC{(tc+-r`J zE0xO$(2hJ zoO-T+Td}()1aR;d}wCUO=AeHJpKU4 zFIAPY;}-HSM+PiK(GUgW@3;P%icczRkw*n)Kl*!We{C%4KzImf{`JB4Ps}`o1yJIy z9D#lBZ|v}sFQ4dPaW;{#@>`#A69J%x0)3>>-`x8Py=uY&Fly@;Wi$Dxt6>V9vK+bwpU+ z`i>f`h?`1o>7T>UPmv5M0m{s$ZEN!Q)^}oHF+u65`d`}O;lHQ(I~U-$EFGc&1rAjw zobvl~f&71*l>hV7*KD(^|6#v%u%6u`E2VphhUOJbg9g@z<3dB@n$Ku$a&>dtB*{BD zB_=*m$>aV9d;B$XAAC3z9Z;*+%-GE3ZW3EWiOte56pF@t7B;7+0FsF`9vof}i0(V< zD!2ylcGg!q&@I&we&P4&T`c7KHO0fRh10Ni3G17%$|C0eT%vZr?8 zMC}<0@BcYC{$(JZV`_e}d$ip=GbiA6>bkXl>HDQ_ITMr;w7NE31$tmWd4hr`YLi$^ z-*|X&x_8!PyCil%I%D@}ey(GL4gW30Ewy}4SsuTlh&>y|&`-yWlX7NU#``i8fjM(h zbz-bOZo_$j?m)$YMu!mQnkX9D)S{19^_M5mxsC)gt&e~^W17&QcWT$Ezbz{W zO$Vd>5Dm|H*k}7oCib|n3*b%=z}(8v%1}(e&QeU$!P}kpzTB3JDpaG)8S 
z%NN7U_9T+8*sPV;lc58<&=&XbjQ1DCS`fi;?tik|NH7Q+EfZUO!n3Jf-7I8B;iEXi z+~p;2Bg)!yC(u2u^Lc+R1*Mzl_;~}57Tl84`2vV0VUKw?#CKgXfEO2RceQ+>UGSAe z`{KR^38fDS9v(?Bry7xFSecNAx_)7@7l*IE=pTA<2hNj>wE1aPYJ4|qK<{PQGxQJ6 zu;nO!q=kSC?g<)L(79FkxleyO*`5=rL0m#xWCLUeF6EpD;A3gFh?d(dC# zHzQdz;;OfcUd;s68FBN2n=hxzbsDoe>#Dp_I;YmQ(1M10!#IK$}yfeRjt67ma378 zg`#W_9(}Mt(P?JRmIB2&&{ltafP{K(SZe78UFPku)KT_mVOG%c9D}D_bU7ih+Quz&5sdd88oqrT}0f%`bmk6`!2!UXx{r(pT?&~|KdOTd+A`AT{EytcBs>_8>uyEmbk;sCINo@m&vmPSQX$O_V>- z9Vaeg2y5+$P@5tNcM6Rt7DZLA)DRGAC=PdPvTI{E=fReC!_#KI`cDf36XhpT3KiK{DP| z(q0%H7u@*{pUBG{@*yEP#3QJk4eN`h*@l`0pVxH#aTqG|tcIru*U@kbsw(POxG0}N zFlE{#sD?SNHqwWwL9%6qMTHRR6*StcQI%Ar50=@~_UFVaOCg)cRsS zs2IT(C!_iI%##tQ@bxf-1bQ=qRGTbAndbw}6}v@WYdbu^%t(VXpqarskw(#Hs`>$L}}L-IvZt`7*_mD>7zvD z!3;D^u@fip++0_8RYSdOGNyjE@{y0M<%N~A^QZ)Qy@be55fkia@t9Q#x`$gtiv%(-H5IH`o)#^=hP@-D}H z`2%dAKIN8y7k#1deX7pW_>0Q8Pz@+!`b0cLYl`~bJ?0}G)6tph3_LEvf&$ap*?h5m z#??<1eATJ_VMklRU5_?7K*l~i%I`sCqFXM!w6WXWsYa z^O_Hpt71IZLQP!~9J!+r&%jD$sOJMk3?A(06 z+SK-?#|7gxb>VS}#$j&Ks@m>&3e5`)Z0=ZKF5SW^cM-Ya9=pkXG9EUpbPXcbFJjw` zUaruq)Yk$n1SMYX%XdpDEQOSQ-y+n)@18qL09!^OHEB7w&17(7osx zg)R4VWYtS7cf$&9evPT?eK}K+xb?HSW3Cu=cDzsq=6@-0xcCbNH|xZm89$mmxQfON zxg=yc;Rwq$qeT_a?)BJ;=&+O3gAoVJ5sI%Q$8o%bDY{9j*_U}$V!&M^M>1PYbIy-h zdE_d%2`yZgGDCvb$-Y=Eo7Vz&E`N3Iu`n<<94e7fXj4 zT7E;8GPUFx!^;1F-hfbJ>*F-RKK#Ph@0M8;JXBVUFEWrDjVUZ@f=wU`A=h; zUIw37LkQgxN7G0MrEVoQ5f3C;@NSttV`4DvhwDY)R>0nJz=%V|9PKbIA+ zNV;IJVR?hlXhW6NSncWN76sd0ImKBw^{p=+lPYGdl;^;WKM~fU$L+plit?j`J!2uf z_HwhgErueU&y5y{p+nT<)-QfQzWyT1W`gpqPtj5WXKi$}+mXxvK92r>6=%?`E_4*v z@g#4puvr8eVwcWOB|CfI35u(qvfHw+)vPB@$pLXaD(6tCGbZsfL*m?KQ1%DS2 z^}{EKUn?01L3sxo>(BFraOBT0I>{1?H2AhN;U3IkONt}+^w#kJ86+9?N7R+e;6eXDL)`TvkNEulPiBBrpqmlkpkUQ`% zk&^M^K@hFo{k83_f-}1S_MtpOz+UD|OtXN^-U-I%sN;XLVd0U;-eToLI6!{a@zC;2Bt@u2& zQxV+3NPGL`bq=2-)KSC9Eld^eTZ}`Gr`mhf>EOb)6>yt3q(jzlmqIGFkWsI=xRHN( zuW@}K(Hk~Gy;y6v1(m)4&lwd90ei*8{`)mfpTiFu;iim0g3;}W#UK)VUh2+X_L^^w z%Psmy!$n^VKCwBoUuN&yhnU|cVx7-NBLeo-3;l^`v_F|6%;Y0y#_JKibsynhuL<~w z8uMOtv*I?FSs)|+X;?_pgn_#`7zO%H{|#i0(msW75>*C^|9G8 z_5Who52S7DH^~pKnW?G%<&71ALlZg$8ajTkx%Csv9k`^j2Z~aGx8`>T&K~gvSMohX zv)g3-sT#MX^E=z&Pip^P{HxC}?IeR}MsgYJ)5U+eo5sUG4exuh2St2Z9~9lX=VdLkyXPd=T;sB)Xn^Xgr+nTo+|fw51KD7 zC3}t|1hWJRMVsez^*!8w+{_)HA~<(mS+p)htx04J()*QdRApt_QAH&oB_^mRB?vvHUy7cYi(}s%WSic5x%FNcQ^wg~d1?FcL)X{{EqRy#;6pVBtLR+-3HHpL($&tn|i;Cc?vmWP;0PEpKla2i$O3iP1>P*6O?QNYcG)0$~He zDBOhp9@Nx-utb4^6Rcq7F5Qm4LEJw{p=KFtb@D7i>3@C$v54 zQdMRmxjIPzyky;>35FIr+QBZ~q0IW{k9r!ma2?k=A}5xh3_hPJ?;nY4)YoF56BCJSNtT4oAfb!>)ylSG>n}&L9DJ z!!1nTn^<^{d!8wSx=5^&Aq}O#jTH=z!Z4@I#j_WyObKqZ4a%F3tAvgk&oE5h^~2j$ zh_|wJC(YEi_&ioPB-T4=DchauBB|8BQ00el>hca7f-hQo+n3DDf)t+cqxUOJ7)IHA zg}RgWIc&?REmUyp`JRQlT}%$FG=$I6mErIw`a25ZMEsf>8+ZCi7u^`w!kS@B;*9!I zyz!FVt&Dt5P8a3IG}6O5wK*%>Dmh@g(7*pBvN0i+w5yg$T^BsmkuTodd#2RKr4{e8 zVwHBSrR)=q0SYzw&ak4q>yOok>1nunj!X}z1`-2u%XRhbI-NInWpkCGJkq+a zHNpDooFVEcYHt}$Fog|#w5+=H6N23t2V1v9BpvNNJo2K#Rgdx^bCY>82@e5VC_I5= zGDpSC3SD74z4^*U)o2Z!?~r;4*CFr3%;4&LZ$e+W+v9MYxCiyplp4vCe6-Acmp!z0 zX!RR7E;R5HKf2|tZ-kc6DoxBKdp4@=D~YGNl*;rG_!`Dkc4e9JD_e6Ja$zKhUnNNo z8_&_7XUjS}N4#v^YUmXhOCQBc)S!;~V!L^Pi?;f*W~-DEhZ3jA`>?w&)O(qcG1&Gf z>cH-cRT`4+R_!^3Y|O9+lmAS~ygSK4TrZ$KF%~tp$dFEHd z2KhGoPQ_RKnxHbr^<+Eo?v4t|6XgV%`VIs(Y5sL~W+Rd;=8L`@L1XeUnkyk~!k10r z6sY(5_w=y$5rp8XcC8+CcCI&Gh>`vD(F~cxB{)m(W_(9>F0l@nZ*KiZdlI@Oci(6A zPf2QAg)s4LBkb}SMfYtxd)b2r&qA$@fu?`OS4{{;PHh8i7&N?xPUe_u-n-RhsJ8K@ zJhLfj_@{n)aaCyoe?STni#`>a|TxrImU+IZCTmCXp4xvkhWd25fa7 z%y<|-o|`NveSB=@P_1>{z}}Te`$|X$zyXIf6~fIPda{*!xBO6`HprvDW-#$qcW~uGp 
zCRdX^Z1}dGHxF&q)p>Z{@5-=Ns`E7<;QQKOh%V)&_n7b_tx5G!+Tz!4X7fl59aL96_8PLpGC*xaewvWDwkk@%&pezK@-gYvnZPv(MH zZe~?MN--sSjxm=kD~q4#W2TV4Fs%2-yT5Y*m^N_XKT}{``W`!!aK|fjK{>vQJ`%S% zUgo~JC=3Zk5{?jp)v+Kt#0X#1CVSCN*>!NOPQBZFY`%$+?TK^0ayUCmp`BQ$F&Z8D z-4n;IP$BKywg;aJWBEQo$}`*6!@TZ=LursBA%W0DVX!=5iv~gjkD6=!6wCZq zgLcEdcLG4Z6c_wUtKMe^879aC3-NOgNAKB`KvN1N!|j+*O(27myB)9PbgRNjWs?$+ zU#-SCak&T;wmD#it_tJOo%+P@?dB<$dzTo=iuRy`r(z)^33L(ehaxr@YsW{LD?2Xp z_xM4dh~Z<&Sls85*BQQnL*9(fDVUCP9NM+2hwyl2xbDZlY9GvT9%)%BNQauT?lw=& zdlat>HN8^i8kpL4+rZ&mv83YErC&4IG!6;|?l#BUnE&D>A9fe+Tp|Qhr#;36r-K6v;He zhfqs9g386Vldt9p@W&WyG-NugDMA9jk`HKlh7k%Rq!FXXH$SH);m!4*4{@#SoA`g& zd&__*yRLr}5d}d)5D*wb5Ky`q8U>_5xjW2E%I2k-lT zp7(jr$Mf-g5@)Wt_TDRgEB4-NZOy~o(ViK6{$R~j*41%{o=KrMukyx3;G28$sjjue z=98vnvgACe-(n+zc(Nwbw6VV%)Y-F;@&oBHoa*fzXh_`(p)U2|3uo=wymGkzLG50z z{wL?&h|M?fdK`27c6>Agm&RQz-Ye;yaP(Snz0b`z3&QH8>E6y}W8o;h{+7nsM{*Q+D?%lF{KQgIiGNdxR5)t3_ROO`_u>l~D@7{UTK3kiK|aQC=$b!} z99T%r`K|k77SudZZSs1xj%)S3Wmq3DF-&xMU(!xZg>uX`sB-F-u^@V}Z0JaxxaS23 ze5$?G=}*jxGu(KS3j}1A{?ey?NU!|R?d(GSXSS75$7OBB9Ib!cVa;omD#;mGR)R;K zgkJt5WrPz!*r98&3RK=iTqJ^}_t~K#t+$qiDMg z-{Gs_&sVv7!El84s%N3DN(dF-wMt??cE++3??JD*Wa#CFj36t4Apx&z{HNbtPtA4` zT$N*QM#DS#G8$!d27x8{C-TVz`Brh*yROt&>?e_ZUx4n2^%e|d%6R?&ySf+mo zdNk1CZIJCcrRZE=iYt{h($=f&4)Ls-&T=OAdW#!Jy)TQ3--%6P^b>a)kT2fuF2y`) z8@T?sdgK%5C8APRv?ccze~mG?yI(PiwfNFpDTCzg+ExXU9we|w33*-u(%+2bUMmqdvtBC{N`Ly^?Xlf) z9b9maGS4PjI}HtjJm(60^2}N$UEA9GF09S>CklE%@t)YmL9;ZaB# zIp*X0GA(U|wW3>EgE(pGlnv`A;?2vJc8fL z7MKVQB<>Wj`(l}xz1=IzAu@wVg6bAj%e6zrY5iZq7xyv~7Tp=D~x>xa5xFCvzB@L z@q@h}`R>iPy=MOKR!(`PPH!YNmU@8D!25&sc}Cq5D^}8b+(HuOWGWlQl=!7--XGT3 z-X%2=!Na0CRyjT6eB4Lq3nDiPhvVBO^?0sRCd8o3M{L8};NVpu2Sk4sbbHU7wq-2v zVfntpxb52KwKn6l8r97`;cx4wLx;p({rb{cF@7LE@k~CoO(2;HaxjFbM_ScBlVB*Dl+OqTAG)ya3|bKTUXK-V zvPwKtyh)925Pg`c%ZNAQV)3nEmu>npWgd5wk!a53>wr2nSr+bcZ(w$}?eX@|>Q%_g zHG=HA=8pCC6CHMVi2?822Zig&K20l%uRQH(a+iJ=m+*f;IN59+y)$jgJkkr4c%$-~ zv-X~diHiT)cwob_jo)8C%gM@BEP94HWcCBgL!Z)9!Gv|2hrFZXZEf}IKnDZ;5cYgqy7!2;S5-#(vE_PBHK(vs z@7@QCdff}$L>t6t%}@&>O+<-*g2CKUie!oHP48l3JR6M&=bj(82;xGEsPGQqY6v;T zsQh=X{sHI919=10I+$Wf+P2jhet*pk6-~}C*+tBRcxgBW`kr0>uwWrzP1w2MO`24vPF7ZQ=a)*U*qMo=C{QA1tA+Xu7wIv zn1ynczRAOlbq{2@UO0EVGyv)uG>*4Deyq_(tX#ZnucB0%C^Bh-8^f{``ms%N;VbW+ zRN9X3zIfa5dr*=uu~YlvZZV~{{XWm|&`V|N#Eb`2sNsJ^x%AU~KhMJy$)h%1%aXll z2`08oFvCfD?P5AD$799Z%KLi)J+%7<8v0WrX0t^aY~~3B!xV9-N}hfw*fvABGWyf? z%;{vs@bMo+3TOFZy$QV469E3Oagut^-oE50%gP{7oompvJ26Ph!H0XzK-jdu6yOi zuRe8QRG;636~lMKd|Yd=imaC_py7y;)?Sy9|> zJBP?dm7pbA-AS7{sc_~W?{H!q6#aX8eS`fvUF8I^to6SUoC-u!hoMY#F=m;4V=6rhC1mtEE;WFQ+?XO zi4(B-q*b-LzeV?Glhk9M*Hl4dWOVAys?aGv4{xYMtW$(D?}6LCxt3wbb&P!cmy52W z?P2M-`+Q0gfvOX)VT@L5>cs2G!X(jl zCmP=1>M7G|$qBVH+5}jQLH=4oE!nvQyWdX8i@CEXa!KE6yRDJNWO^0?vfp+7(|W@! 
z6e4dId|(=n)0kQ0)L)BGpLs}$;%Zgkk{hRZ5XacdR^bB=VEuYcwO{BN3nwK%9rdyj zuRSXPgI3}OZo~)Z(D(95V{ZDM((XKZ9N9>KV+3`GiGfHL{%K~p7fGZ1vcGFv88ZC1 zYA;3=9Pjkkk~htq)k5?b7m}b5aQ3tJyXu){g`X4l7%F%yI)b7KMP%^r zv~g%`J#%ySITlX80guvYF>~*wx0W3BQDvS-59NN4ob6W?m(Np-&tUM$gNrpp%lv4X zYkg-0xlN=W1ah| z?bc*x*ZP!<=wqd-`>kbBEc!)L3}*y!PUM92k8e3<&0Tb`pb~5%_6_R)uoR}lRA}wv z_YC(wlG-;yBj?M~8}bj=w>LcUE{ZaET2mLlBky==Ai--bKkM3c<85!{;HG&TzgOX2 zan}U92QMpzNuu0QhhLxNfPDyyi-Sf}7g@dhi$4!DBOt;nw`<13I&fRXvj}h#S@&`s z*6vDd8X!=*K5=#9`*yv}1g?W{Uq5bW3aQYh>hon@!eq!Ls6qDLGGRv2tnQCdQ`;YC z=^+_nXS>^y*Gpnr1Rir)VJ|lygg+Hh6#A;E;2kfkOP-kai&9!3Zy&txI4yXjxh_o- z@b>@s@kr8TsGYrJLI?GO7UOd-%h4GD%dC#W<7Tc#@c_2`$8>?N$FmVj?Au*CRY#)R zq=@Q@^Wh*tnjtb{XasJCHl(|FN+t@38!^v?ysrC@Ruw1xy+bp7**!g!W1WY&`<1z} z&Je`2sGk=s;SYUfDDrG6-)T)Va@^LxLSx9pX4{pzP!<-4u+(b_c>P0<@klydm=b(zh^IhdM zR!O3ydVbp7dsN5>0ma;fg7ax#7n6}$;-OvEvhV3na<)xGQP%#nAE+O!w{IPa^{edl zSXDd>du8p{(|Q-N5Ff!C^a_)ZnL$ul8S_ za;_#i*R^f5#=u|A^*ws)Lubg%t;ij)T~jn4#BQB;L!B*7ktK?xKn+ zanxt(*QnwqRki8>hN+6IAf6EGrlWmGv6`T?6m4s2jB-tHFZS!4_QX@iFYoetclmEC zyd|-F5AJx?w{NBm@nTLKam*%)oO2gBup-`!MUD__n{kswn^Y8hk&tx0)xFLu?7?!7oF-dsE)2g8ASgM$`Ij}uIBgn!$P@`{v0iuk zAX#72n!+*G2Znnf{`aV}Dzl9P0q&7sR^q#w?SLmR-lL^ZJ=VB=e%Ps`in#w3qn1-hAp){-Av{i2q~!<}TmhcOP@p-}9qafe=Z=IbIgY zlwUU2O&5>(>B20&Pyep|DppKAh1_I%LFE?1m3c%d6SKuqvn@0-n!fn!2kY;OqcB%; z#s{4*Ly>`-EG-~P4SlyuRKTo~@14zR^OT^@Uyh2Ky~v1Pn8fyv_A1LQX-LHw^_1j9 z!N!VvE>i-$<`$EhhHyknY4_~gql6e2=f|(^e3;s%lj1^XxrmHqY`e^v%sDNc*Qw)d zV)Z#7(%2VDTI<~UjtOUG<3qe)&MuS@@82z-3?C|2)qM^wo97$xLanvQsob>f=-A78 zLu!!5M?#Qjv$Q&^XH7IwA;ajz>{>X>G9SLfo|Y_4O!rYW&76O`>-xG029Yj|BC;LP_E zM0a>n+zN?bcq*sjC8IqO9xQL^a{^WB#bOSoh0#|>#QRONRCv57g^wQD3j4WQpZNWi zo`&9UGsLLtS($xTbKmaBaS1#0GohPi-_iRawB40(gmhw@5_UZ>@#KB%cAGHt=>grex9%#TgZM80=JuSiOsA7-1{9w zN6oFvUZgitYYfA1B_m#rA_vx^t5`C7?Y=Sxe@kyTmbE zyYb8Y1Z)ITd0Fu3S-lCa#@33|WE@Z^W~!$~a)C=e8&3Rnd!E5(8eN%dO zd&nwihb0@0l#s0x&M4=E_)ln^4}B%VvSxR(vn-Uzj};Gcn&gY}j5-xUCuQM4~=klSp+JZ}wXT7^A+SNs3Q3ZWmwkzY#>ZWn57Gu_z-Kj=yIH!MJ;&!QJB&v0p-3>73kpz=iDq=P!q-{@e|9sZfE%hwuXH8U@7%|!g%KOPmFqm ztNXj--Le{vlLXBq-U^HgMU`>D;a$_#d)R6diP92Qf$3)iPR?h1Sm4rrFIWFir(TKhb8I6FYWW9qMWIoYyM+vNC>5TioCCgb%Jyo&-z+~XV>^pa$e+I5T~1pUj(lQ z9YNzmsmq_$8fbDeY;!Ss%?gelr3{Qyzgl~u#=nGN=uGNm+7~JxB=_30JQxb={}H0H zs%7J`f8vMb6?&B44_9-IN@i{ESz7(3ME?#SF2g^C2t##T5>bJ~NiCCop|s_Hewq6W z6^0D|Q)e^g&&hic*tD15#UU53r$D_e_ZL)NF@Y2^7c++A4GWH>6$4xn&ljW`Bwyu< z5-rFb36I~Q)Y~Mhr&KfKZ)ZOyRBAIUj}T;9il`q=P`_+HQr8X6%a&Xz|YJ=kLD8mbx89HdD$CD%kx+ zcow9|Dje~WYh5Lm#v?r0AYB5|OQ`b^TAA_It7kC#s_cXx%*Vg@gf%3;(GsYc_jbto zjS2rjsvRq#ijk}t!y;E){3q81l{<6t@=_`fHv?x}$i+re*R;oCnA0~G9AKnzp1bdO zlRM^at4xfVBU%;K5^oMVlG6fseygvy9lx@N>mV>;(Dp6a66WTFxBozuArw1FPB>U) zzh*!3a|p|RO+;<;Whr4+gdG?9-m}6bDG@xC0SyuO)iu|XT~0R25SIn7G2eLObEU7FF`4suffJ%Nvr?90gFG(g5kvC zL=Qc9e!XD*jnjA?8SPT258Q$pSBGENWp$wDJYz|E0U9v;tXDje%-LQ1yR))FROiVS z+wD2I<+28aK4Vet4ypY}2bZ7;I>*8^DLt$%b?=Svi?c-^B5{UKWwCwUC@|tg0tMP0 zr-Tjpd>o4Kr;b_Qy!sShq;*$8`d`)-hASwC5}c_-cwye%B45))kYOtj;#ZKrtHYDp z5AF+N^0wwFEZ*O_C4xMScJciCEUg2Dj0EpmHStnKtxkJLiv0(s26(*V$1T4<8}^sE zk}(Lua^7D-;*hlzHu7Z)0~rEzx)L`mCtPua3Eb`HX12o1=?hiN@Z&Xiuiv5T{Z+koU%VNfCqU5%bO416GH(3HZP>vWLuFnnlt&Up)V*&)uDqSjRykZ5*hO zb+}7Gxli$?X)Rqud&6HJSMl+$vUZ^_wUjcXk@D>+u zRs@K4pGIPYKIdl8N?<+%drd&%^u%16C||@VAkU`+=D*oskXu{zE?nxX9kgaB8>(Ij zGCD50N{d%qlcxe_#v(s|?t;lRaMT4J&cIO3x*|tERw~nt*otpIK-7HKBc%Q+oZ!*M zvcAWyPsp>a9c#IIS?N$EeLlT^V0m>BlOTSf=P|lumuru;*gEy^S%A%-xzWNp{OkGR z_~A%tBgplZBdbDVQDM0>osxPjK)Go#g2T!6b!ERT$wP|lLo$V1`p>d|PRdOQSX{mO z#=8a|b{pVYbuQe44{d99iSi5w#yfJ<^tpu;T>*Qb1nj|(@NEKg9?ZJY@N;cavOB6Z z4K~kUPw)D>=tJr#cShIfR)JRH9^l_n?TvTofTeZp^US9AN5_DZPNE>`TfLaufS7^J 
z^w<-#p4D>P#BYRe9C_oIC0=rkM<`kE$tcrfy%p}a&rvSZ@my9l#KMYLj&(OxA1XP0 z|6Q{RZ$^lpMX@A%z#xayZ{SXc$h_C2j39Z6c-wNg?i0gHg zs2Gc$0|CqzyXOH%fdtA7$=-`@s2jm_-}G7{ZdM50}(fy1P&P>gIgmOAYA#5 zb7nIMx`$DchkkYaaXdYl0p;1pl@a=xvcrgEr+aD}vECaprFSIt{0GayMjpKbEydS-TF>JHr{%$C^?XXq z3Gf}14_WRi*A7k!XRt4#F&L3`=LPFQWwHsNEaSMZ=xbqngNN&#q>s+)wFB>I+#JK* zxPQyU&^*sS@a9yor~}&QQPae6QdoT6pFTYYI5q8mAJ(;q7PTuCc!#t9`p-;Jd}Y?u zP8jL`JTeZdZoZ(pqKK9<7ts&+_Q|&XzmJCh|93?Go2dU^o`4pk!Mk7hJ@S8DYv#a6 z!Qo7?`>*r=-+zLS(KS{df(_pPoQ3G!Tcd_|AKo13{g2NdQQTTtJlIOW|G(wa{vOMY zrG3wzVETjPzZTX)^PV4~pQvK=pP%3VB8vBokl-K=b!*V4ndN^7&mbdeNV|0#T9Kgi z#_m}j;j2gg`AgB)2cXt(+wd#h{g&#e6dYrso&Pwq?uzz9#QHWBp1az7lV0nhE(|(9 z9k@;I^r-B$?Q>LC10>`=LuDnd59D9FR?MqL?pYQai0n=})xUE71OJBo{(JmqQi%yc z8l3fjmcvzV-NKxf*gGBp4_xb8kVSqm1od|qw&M2bbC>MF;r5-KtCLi_FS1y5d=)S=vJUBsh!6bVCip}+HCksNi7e((+KSNY|s*u6CZ&{VA6 zvn^Ww=TK%;1AWq6QF3>-NoGY981#>GKu9y_^?*~<{p{f9Jvnz~oQl;0cAb!WIgG9- zOLqBJm%HCR?|LYZ3%k-v-uz5n)UU;Wbqbs~ll?hiVs4JIbU;Fga|=TpZ6$Btt!mhE zKT~XTBFWqRO0vSR)lh`7$|Sd6FJuav6$QzIw_2#wIP|O*0;aIV;y-BmodlJCmh6r` znaIaw#m`V3ucE_zzwk11-k)YN^*Lw1H zZzW%LtTP#W*LqGkHID3$A2WCu^^e4>hVL!eVFbZBqLJl*n4Vn~J_F@~-e|L9oAnU* zF3S=LT8Dc62_K%5xzu2iK?5~4>%k2e`_7J5Qyq*&cPY_UWUE)r;@b?vBG;5^{;PHh z9%pTfzpNKLIz8$fmcEmq`lCdPP2WPpNBd&Pm!V^sHq7^kAB-Vpdn=JuBpS5EUhKXj z^%eA%Y!AS5+Gu|5A`HGgCRsFoh4uwBsZz~U*2H*7u+gw;he9*#|E3yIbUVioYSRaOR5Ya8S0;R}Y1t>(ZjrkHr+Ra-Re`OP+-b@Jx^t|c6 z_33yJ3)QqjkF3IqLc%6(WsyPC<>>3q8VGSHxuvRx@1sWW}Z`(ij$3BI6z{NkWh z!qJPAKmWX*^WWEV0=_T?-cCh6PYB+k*^55W4DORbs~NJtgTi4_WE)U0J@Vr3nBS6V zaXMlyVZArn@);CRB3f#Oxq^Rjiu<1({se7A><;8pwoyQnoKsR=p+9gim&*0FK%|<) zD*}=Tlj+_Cb)6Qe@)KX+#)H)xPQ1gF30W;ckE%9$@U3mp)pM^c07;v8S@1XuDdY?C zaklM5W!xLNy8S4OXXCCQDR#;i5(j8@&^l-ano4}bFJ2@y1vwHZpCYMQ=;|+I0!lym z>RvWH1%1)irlR}ILBN*|-|gIUkkhYNdxm&_?_}N{`u>6C?YE72 z8-IB_?b-;17`TNG&Lk2%?)L$*X;-4A*jxr`Q1!0;(84&01!GcZ?6LWOS+#p~s;kP`d5A+{(!3W>st34=J-mN1jUgw(cF%#c@KVt3WYvTNd~h# zE7M0;WdLN$+et|42MR~PN_B%5$V!3WL?Sw}(a?7?lr~E05A!-W=xo$BUw6kDk5Q~1 z^`vSnXw^9YU2Dera>F^$xQXq~rGIt~p50<}u$ z!L!5nsY1cT**^H(#<>3h%{4*yOu;>~K zyZ537|0=SVO+0N6H&TM2+00A~I33FltS#4ZeSiNFpsx!t)>Z zFiw!|#TqB<_IrFm?*lc#s`ov9fJ6yc&hIGziAsi_*&}6D!qLu0l=WW{HCV^4V*NuR zKBi|#rNNna%x}yGF)CD9Lk07}`uP*>gExSs$pLrrSGnL#Io}-oYYU)|5Jdf>P3^7b zM>{M)eAR{S3;?NMi^np5JZSun2x*7Jo5RxpSeHWjEirn4RQnNg1F2>o)dQBd(5*XB z3{VE=k%CxC6QJ)|)mwCB4gf>B8y}5|sBk)p>s^%qzdq~%ep$4s>>&00xYLdTwc-DY zP$evyaq=${vP%4+5L?%TBI#isvI-p+{mfV7=v} z+4bP=CEv~6OK^1S9~o*=uX*Kn_pi12iU}ldQ1&*^{6~|3Zy(g4ZHWMn76o5ai>0_A znNTF-t4;;jlYcmMr2ZI*DmP3p+yzWf_fnZg>T$osCv=RHpnpfG;;DJ<_Rk1@GGK&! z`g_3MST4`90(xGmB}bZ>f+*V3CI-a%ZUqqQ zS7Y*q>qzU7{6Ls$kn>+z8i-p-Ya{t<)t)lo`z|vBLW$M>0ff3V=!mp?3`QXS@!1^X z1PK7EOc7L_X3S9;A$25SMjJI*` zAIn@wFAtkCy8l<_AY=Y5hiZ?qefh!4jljyO^58F`k!_v@89iU5AYt|_gW6Z;KImIt z=%I0hbQT00*+X6e2c`}0>;}}IEu$hAj)GN;w}JleAe79oP33h=q|8zK? 
GIT binary patch data (base85-encoded payload omitted)